Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for #292 and changed filenamelimit #293

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions dumpgenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,23 @@ def getVersion():


def truncateFilename(other={}, filename=''):
    """ Truncate filename if its UTF-8 encoding reaches other['filenamelimit']

    Names whose UTF-8 encoding is strictly shorter than the limit are
    returned unchanged. Longer names are cut down to a prefix, followed by
    the md5 hex digest of the full original name (so distinct long names
    stay distinct) and the original file extension. The prefix is sized so
    that prefix + digest + extension + other['imagesdescext'] still fits in
    other['filenamelimit'] bytes.

    other: dict with 'filenamelimit' (int, in bytes) and optionally
        'imagesdescext' (suffix later appended for the description file,
        '.desc' when absent).
    filename: the file name as text; byte strings are decoded as UTF-8.

    Returns the (possibly shortened) file name as text.
    Raises ValueError if the limit is too small to hold even the digest
    and the description suffix.
    """
    limit = other['filenamelimit']
    descext = other.get('imagesdescext', '.desc')
    if isinstance(filename, bytes):
        # Python 2 str / Python 3 bytes: work on text from here on
        filename = filename.decode('utf-8')
    encoded = filename.encode('utf-8')
    if len(encoded) < limit:
        return filename

    digest = md5(encoded).hexdigest()
    pieces = filename.split('.')
    if len(pieces) == 1:
        fileext = ''
    else:
        fileext = '.' + pieces[-1]

    # make room for md5, file extension and imagesdescext; everything is
    # measured in encoded bytes because that is what the filesystem limits
    reserved = len(digest) + len(descext.encode('utf-8'))
    budget = limit - reserved - len(fileext.encode('utf-8'))
    if budget <= 0 and fileext:
        # whatever follows the last dot is too long to be kept as an
        # extension; treat the name as having none instead of failing
        fileext = ''
        budget = limit - reserved
    if budget <= 0:
        raise ValueError(
            'filenamelimit %d is too small to truncate filenames' % limit)

    # cut on the byte level and drop a trailing, partially cut character
    prefix = encoded[:budget].decode('utf-8', 'ignore')
    trunked_fn = prefix + digest + fileext
    print('Filename is too long, truncating. Now it is: %s' % trunked_fn)
    return trunked_fn


def delay(config={}, session=None):
Expand Down Expand Up @@ -1097,13 +1111,8 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
delay(config=config, session=session)

# saving file
# truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash
# limit). Later .desc is added to filename, so better 100 as max)
filename2 = urllib.unquote(filename)
if len(filename2) > other['filenamelimit']:
# split last . (extension) and then merge
filename2 = truncateFilename(other=other, filename=filename2)
print 'Filename is too long, truncating. Now it is:', filename2
filename2 = truncateFilename(other=other, filename=filename2)
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
r = requests.get(url=url)
Expand All @@ -1123,7 +1132,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8'))
)

f = open('%s/%s.desc' % (imagepath, filename2), 'w')
f = open('%s/%s%s' % (imagepath, filename2, other['imagesdescext']), 'w')
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
if not re.search(r'</mediawiki>', xmlfiledesc):
# failure when retrieving desc? then save it as empty .desc
Expand Down Expand Up @@ -1498,9 +1507,10 @@ def getParameters(params=[]):

other = {
'resume': args.resume,
'filenamelimit': 100, # do not change
'filenamelimit': 140, # encryptfs reduce the filename limit from 255 to ~148 chars :/
'force': args.force,
'session': session
'session': session,
'imagesdescext': '.desc'
}

# calculating path, if not defined by user with --path=
Expand Down
82 changes: 82 additions & 0 deletions testing/test_dumpgenerator_offline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This file is part of the wikiteam project.
#
# Copyright (C) 2017 Robert Felten - https://github.com/rfelten/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..')) # q&d import hack, sorry
import unittest
from dumpgenerator import truncateFilename

# This file is intended to test offline functionality of the dumpgenerator.py.
# For all other tests see test_dumpgenerator.py


class TestDumpgeneratorOffline(unittest.TestCase):

def setUp(self):
other = dict() # FIXME: get from dumpgenerator, but code base is a pre-OO mess
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for inspirational quote. Wonderful poetry that summarizes dumpgenerator.py in a single sentence.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hehe, sry, could not resist ;)
I was pretty discouraged after giving up to get the configuration out of dumpgenerator.py :(

other['filenamelimit'] = 140 # encryptfs reduce the filename limit from 255 to ~148 chars :/
other['imagesdescext'] = '.desc'
self.other = other

def tearDown(self):
    """No per-test cleanup is performed."""
    return None

def helper_truncateFilename(self, fn):
    """Run truncateFilename() on fn and assert the result fits the limit.

    NOTE(review): the comparison uses len() of the returned string, which
    presumably counts characters rather than UTF-8 encoded bytes - confirm
    this is the intended measure.
    """
    shortened = truncateFilename(other=self.other, filename=fn)
    limit = self.other['filenamelimit']
    actual_len = len(shortened)
    failure_msg = "trunced filename '%s' len of %d exceed limit of %d." % (
        shortened, actual_len, limit)
    self.assertLessEqual(actual_len, limit, failure_msg)

def test_truncateFilename1(self):
    """Check that truncateFilename() obeys other['filenamelimit'] - real world example 1."""
    real_world_name = u"Assortiment de différentes préparation à bases de légumes et féculents, bien sur servit avec de l'injara.JPG"
    char_count = len(real_world_name)
    self.assertEqual(char_count, 108)
    # accented characters take more than one byte in UTF-8, so the encoded
    # form is longer than the character count
    byte_count = len(real_world_name.encode("utf-8"))
    self.assertEqual(byte_count, 113)
    self.helper_truncateFilename(real_world_name)

def test_truncateFilename2(self):
    """Check that truncateFilename() obeys other['filenamelimit'] - name exactly at the limit, no file extension."""
    limit = self.other['filenamelimit']
    self.helper_truncateFilename("A" * limit)

def test_truncateFilename3(self):
    """Check that truncateFilename() obeys other['filenamelimit'] - name exactly at the limit, with file extension."""
    padding = "A" * self.other['filenamelimit']
    # swap the last four characters for an extension, keeping the length
    with_ext = padding[:-4] + ".jpg"
    self.helper_truncateFilename(with_ext)

def test_truncateFilename4(self):
    """ Test if truncateFilename() obeys other['filenamelimit'] - valid name w/ file extension"""
    # Floor division: with "/" the repeat count becomes a float on Python 3
    # and "A" * <float> raises TypeError; "//" gives an int on 2 and 3.
    half_limit = self.other['filenamelimit'] // 2
    fn = "A" * half_limit
    fn = fn[:-4] + ".jpg"
    self.helper_truncateFilename(fn)

def test_truncateFilename5(self):
    """Check that truncateFilename() obeys other['filenamelimit'] - name at the limit, with file extension (unicode)."""
    padding = u"è" * self.other['filenamelimit']
    # swap the last four characters for an extension, keeping the length
    with_ext = padding[:-4] + ".jpg"
    self.helper_truncateFilename(with_ext)



# Run the test suite when this file is executed directly.
if '__main__' == __name__:
    unittest.main()