def truncateFilename(other={}, filename=''):
    """ Truncate filename if it would exceed other['filenamelimit'] bytes

    The limit is applied to the UTF-8 encoded name *including* the
    description suffix other['imagesdescext'], because the caller later
    writes a second file named filename + other['imagesdescext'].

    other: dict providing 'filenamelimit' (int, max bytes of a file name)
           and 'imagesdescext' (suffix of the description file, e.g. '.desc').
    filename: the (unquoted) file name to check.

    Returns the filename unchanged when it fits. Otherwise returns
    <prefix of filename><md5 of the full filename><file extension>, where the
    prefix is cut on a character boundary so the result fits the byte budget.
    Raises ValueError when the limit cannot even hold the md5 digest.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    filename = text_type(filename)
    encoded = filename.encode('utf-8')

    # Filesystem limits are in bytes, and the description file must fit too.
    descext_len = len(text_type(other['imagesdescext']).encode('utf-8'))
    budget = other['filenamelimit'] - descext_len
    if len(encoded) <= budget:
        return filename

    digest = md5(encoded).hexdigest()
    parts = filename.split('.')
    fileext = '.' + parts[-1] if len(parts) > 1 else ''

    # make room for md5 and file extension (imagesdescext is already in budget)
    room = budget - len(digest) - len(fileext.encode('utf-8'))
    if room < 1 and fileext:
        # The text after the last dot is too long to be kept (probably a dot
        # inside a long title rather than a real extension): drop it.
        fileext = ''
        room = budget - len(digest)
    if room < 1:
        raise ValueError(
            "other['filenamelimit'] is too small to truncate filenames")

    # Cut on bytes; 'ignore' drops a multi-byte character split by the cut.
    prefix = encoded[:room].decode('utf-8', 'ignore')
    trunked_fn = prefix + digest + fileext
    print('Filename is too long, truncating. Now it is: %s' % trunked_fn)
    return trunked_fn
Now it is:', filename2 + filename2 = truncateFilename(other=other, filename=filename2) filename3 = u'%s/%s' % (imagepath, filename2) imagefile = open(filename3, 'wb') r = requests.get(url=url) @@ -1123,7 +1132,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8')) ) - f = open('%s/%s.desc' % (imagepath, filename2), 'w') + f = open('%s/%s%s' % (imagepath, filename2, other['imagesdescext']), 'w') # Banner featuring SG1, SGA, SGU teams if not re.search(r'', xmlfiledesc): # failure when retrieving desc? then save it as empty .desc @@ -1498,9 +1507,10 @@ def getParameters(params=[]): other = { 'resume': args.resume, - 'filenamelimit': 100, # do not change + 'filenamelimit': 140, # encryptfs reduce the filename limit from 255 to ~148 chars :/ 'force': args.force, - 'session': session + 'session': session, + 'imagesdescext': '.desc' } # calculating path, if not defined by user with --path= diff --git a/testing/test_dumpgenerator_offline.py b/testing/test_dumpgenerator_offline.py new file mode 100644 index 00000000..0bbe7ddc --- /dev/null +++ b/testing/test_dumpgenerator_offline.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This file is part of the wikiteam project. +# +# Copyright (C) 2017 Robert Felten - https://github.com/rfelten/ +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))  # q&d import hack, sorry
import unittest
from dumpgenerator import truncateFilename

# This file is intended to test offline functionality of the dumpgenerator.py.
# For all other tests see test_dumpgenerator.py


class TestDumpgeneratorOffline(unittest.TestCase):
    """ Offline tests (no network access) for truncateFilename() """

    def setUp(self):
        """ Build the 'other' settings dict that truncateFilename() reads """
        # FIXME: get from dumpgenerator, but code base is a pre-OO mess
        self.other = {
            # encryptfs reduce the filename limit from 255 to ~148 chars :/
            'filenamelimit': 140,
            'imagesdescext': '.desc',
        }

    def helper_truncateFilename(self, fn):
        """ Assert that truncateFilename(fn) is not longer than the limit """
        limit = self.other['filenamelimit']
        fn_trunc = truncateFilename(other=self.other, filename=fn)
        # NOTE(review): this compares the character count; filesystem limits
        # are usually counted in bytes - confirm whether bytes are intended.
        self.assertLessEqual(
            len(fn_trunc), limit,
            "trunced filename '%s' len of %d exceed limit of %d." % (
                fn_trunc, len(fn_trunc), limit))

    def test_truncateFilename1(self):
        """ truncateFilename() obeys other['filenamelimit'] - real world example 1 """
        fn = u"Assortiment de différentes préparation à bases de légumes et féculents, bien sur servit avec de l'injara.JPG"
        self.assertEqual(len(fn), 108)
        # chars like 'è' will extend length - this is maybe unexpected
        self.assertEqual(len(fn.encode("utf-8")), 113)
        self.helper_truncateFilename(fn)

    def test_truncateFilename2(self):
        """ truncateFilename() obeys other['filenamelimit'] - longest valid name w/o file extension """
        fn = "A" * self.other['filenamelimit']
        self.helper_truncateFilename(fn)

    def test_truncateFilename3(self):
        """ truncateFilename() obeys other['filenamelimit'] - longest valid name w/ file extension """
        fn = "A" * self.other['filenamelimit']
        fn = fn[:-4] + ".jpg"
        self.helper_truncateFilename(fn)

    def test_truncateFilename4(self):
        """ truncateFilename() obeys other['filenamelimit'] - valid name w/ file extension """
        # floor division: a str can only be repeated an integer number of times
        fn = "A" * (self.other['filenamelimit'] // 2)
        fn = fn[:-4] + ".jpg"
        self.helper_truncateFilename(fn)

    def test_truncateFilename5(self):
        """ truncateFilename() obeys other['filenamelimit'] - longest valid name w/ file extension (unicode) """
        fn = u"è" * self.other['filenamelimit']
        fn = fn[:-4] + ".jpg"
        self.helper_truncateFilename(fn)


if __name__ == '__main__':
    unittest.main()