Skip to content

Commit

Permalink
Merge pull request #645 from benbou8231/add_archive_file
Browse files Browse the repository at this point in the history
Add archive file to cache files that passed checksum validation (#600)
  • Loading branch information
jjjake authored Jun 14, 2024
2 parents c97ec24 + e003e4e commit 8815d75
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 20 deletions.
14 changes: 14 additions & 0 deletions docs/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ Alternatively, you can skip files based on md5 checksums. This will take long
skipping nasa/nasa_meta.xml, file already exists based on checksum.
skipping nasa/nasa_reviews.xml, file already exists based on checksum.

Furthermore, you can skip files based on md5 checksums and use a checksum_archive file. This will be faster than checksum alone because checksums will only need to be calculated once for every file already downloaded. Once calculated successfully, the item/file will be written to the checksum_archive file and succeeding runs will skip the checksum validation::

>>> download('nasa', verbose=True, checksum_archive=True)
nasa:
skipping nasa/__ia_thumb.jpg, file already exists based on checksum_archive.
skipping nasa/globe_west_540.jpg, file already exists based on checksum_archive.
skipping nasa/globe_west_540_thumb.jpg, file already exists based on checksum_archive.
skipping nasa/nasa_archive.torrent, file already exists based on checksum_archive.
skipping nasa_files.xml: 2.56kiB [00:00, 5.76MiB/s]
skipping nasa/nasa_itemimage.jpg, file already exists based on checksum_archive.
skipping nasa/nasa_meta.sqlite, file already exists based on checksum.
skipping nasa/nasa_meta.xml, file already exists based on checksum.
skipping nasa/nasa_reviews.xml, file already exists based on checksum.

By default, the :func:`download <internetarchive.download>` function will download all of the files in an item. However, there are a couple parameters that can be used to download only specific files. Files can be filtered using the ``glob_pattern`` parameter::

>>> download('nasa', verbose=True, glob_pattern='*xml')
Expand Down
6 changes: 6 additions & 0 deletions internetarchive/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ def download(
verbose: bool = False,
ignore_existing: bool = False,
checksum: bool = False,
checksum_archive: bool = False,
destdir: str | None = None,
no_directory: bool = False,
retries: int | None = None,
Expand Down Expand Up @@ -335,6 +336,10 @@ def download(
:param checksum: Skip downloading file based on checksum.
:param checksum_archive: Skip downloading file based on checksum, and skip
checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:param destdir: The directory to download files to.
:param no_directory: Download files to current working
Expand Down Expand Up @@ -368,6 +373,7 @@ def download(
verbose=verbose,
ignore_existing=ignore_existing,
checksum=checksum,
checksum_archive=checksum_archive,
destdir=destdir,
no_directory=no_directory,
retries=retries,
Expand Down
3 changes: 3 additions & 0 deletions internetarchive/cli/ia_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
ia metadata --formats <identifier>
--checksum-archive Skip files based on _checksum_archive.txt
[default: False].
--on-the-fly Download on-the-fly files, as well as other matching
files. on-the-fly files include derivative EPUB, MOBI
and DAISY files [default: False].
Expand Down Expand Up @@ -198,6 +200,7 @@ def main(argv, session: ArchiveSession) -> None:
verbose=not args['--quiet'],
ignore_existing=args['--ignore-existing'],
checksum=args['--checksum'],
checksum_archive=args['--checksum-archive'],
destdir=args['--destdir'],
no_directory=args['--no-directories'],
retries=retries,
Expand Down
71 changes: 57 additions & 14 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,25 @@ def __repr__(self):
f'size={self.size!r}, '
f'format={self.format!r})')

def download(# noqa: max-complexity=38
self, file_path=None, verbose=None, ignore_existing=None,
checksum=None, destdir=None, retries=None, ignore_errors=None,
fileobj=None, return_responses=None, no_change_timestamp=None,
params=None, chunk_size=None, stdout=None, ors=None,
timeout=None):
def download( # noqa: max-complexity=38
self,
file_path=None,
verbose=None,
ignore_existing=None,
checksum=None,
checksum_archive=None,
destdir=None,
retries=None,
ignore_errors=None,
fileobj=None,
return_responses=None,
no_change_timestamp=None,
params=None,
chunk_size=None,
stdout=None,
ors=None,
timeout=None,
):
"""Download the file into the current working directory.
:type file_path: str
Expand All @@ -159,6 +172,11 @@ def download(# noqa: max-complexity=38
:type checksum: bool
:param checksum: (optional) Skip downloading file based on checksum.
:type checksum_archive: bool
:param checksum_archive: (optional) Skip downloading file based on checksum, and
skip checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:type destdir: str
:param destdir: (optional) The directory to download files to.
Expand Down Expand Up @@ -201,6 +219,7 @@ def download(# noqa: max-complexity=38
verbose = False if verbose is None else verbose
ignore_existing = False if ignore_existing is None else ignore_existing
checksum = False if checksum is None else checksum
checksum_archive = False if checksum_archive is None else checksum_archive
retries = retries or 2
ignore_errors = ignore_errors or False
return_responses = return_responses or False
Expand All @@ -215,6 +234,8 @@ def download(# noqa: max-complexity=38
file_path = file_path or self.name

if destdir:
if verbose:
print(f"destdir: {destdir}")
if return_responses is not True:
try:
os.mkdir(destdir)
Expand All @@ -228,13 +249,29 @@ def download(# noqa: max-complexity=38

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if checksum_archive:
checksum_archive_filename = '_checksum_archive.txt'
if not os.path.exists(checksum_archive_filename):
with open(checksum_archive_filename, 'w', encoding='utf-8') as f:
pass
with open(checksum_archive_filename, encoding='utf-8') as f:
checksum_archive_data = f.read().splitlines()
if file_path in checksum_archive_data:
msg = (
f'skipping {file_path}, '
f'file already exists based on checksum_archive.'
)
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
elif checksum or checksum_archive:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

Expand All @@ -243,6 +280,10 @@ def download(# noqa: max-complexity=38
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
if checksum_archive:
# add file to checksum_archive to skip it next time
with open(checksum_archive_filename, 'a', encoding='utf-8') as f:
f.write(f'{file_path}\n')
return

# Retry loop
Expand All @@ -256,15 +297,17 @@ def download(# noqa: max-complexity=38
and self.name != f'{self.identifier}_files.xml' \
and os.path.exists(file_path.encode('utf-8')):
st = os.stat(file_path.encode('utf-8'))
if st.st_size != self.size and not checksum:
if st.st_size != self.size and not (checksum or checksum_archive):
headers = {"Range": f"bytes={st.st_size}-"}

response = self.item.session.get(self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params,
headers=headers)
response = self.item.session.get(
self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params,
headers=headers,
)
# Get timestamp from Last-Modified header
last_mod_header = response.headers.get('Last-Modified')
if last_mod_header:
Expand Down
10 changes: 8 additions & 2 deletions internetarchive/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ def download(self,
verbose: bool = False,
ignore_existing: bool = False,
checksum: bool = False,
checksum_archive: bool = False,
destdir: str | None = None,
no_directory: bool = False,
retries: int | None = None,
Expand Down Expand Up @@ -627,6 +628,10 @@ def download(self,
:param checksum: Skip downloading file based on checksum.
:param checksum_archive: Skip downloading file based on checksum, and skip
checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:param destdir: The directory to download files to.
:param no_directory: Download files to current working
Expand Down Expand Up @@ -670,6 +675,7 @@ def download(self,
ignore_existing = bool(ignore_existing)
ignore_errors = bool(ignore_errors)
checksum = bool(checksum)
checksum_archive = bool(checksum_archive)
no_directory = bool(no_directory)
return_responses = bool(return_responses)
no_change_timestamp = bool(no_change_timestamp)
Expand Down Expand Up @@ -746,8 +752,8 @@ def download(self,
ors = True
else:
ors = False
r = f.download(path, verbose, ignore_existing, checksum, destdir,
retries, ignore_errors, fileobj, return_responses,
r = f.download(path, verbose, ignore_existing, checksum, checksum_archive,
destdir, retries, ignore_errors, fileobj, return_responses,
no_change_timestamp, params, None, stdout, ors, timeout)
if return_responses:
responses.append(r)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ line-length = 102
max-complexity = 33

[tool.ruff.pylint]
max-args = 23
max-args = 24
max-branches = 33
max-statements = 124

Expand Down
31 changes: 28 additions & 3 deletions tests/cli/test_ia_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ def test_clobber(tmpdir_ch):

stdout, stderr = call_cmd(cmd)
assert files_downloaded('nasa') == {'nasa_meta.xml'}
expected_stderr = ('nasa:\n'
' skipping nasa/nasa_meta.xml, file already exists based on length and date.')
prefix = 'nasa:\n'.replace('\n', os.linesep)
filepath = os.path.join('nasa', 'nasa_meta.xml')
expected_stderr = f'{prefix} skipping {filepath}, file already exists based on length and date.'
assert expected_stderr == stderr


Expand All @@ -84,7 +85,31 @@ def test_checksum(tmpdir_ch):

stdout, stderr = call_cmd('ia --insecure download --checksum nasa nasa_meta.xml')
assert files_downloaded('nasa') == {'nasa_meta.xml'}
assert 'nasa:\n skipping nasa/nasa_meta.xml, file already exists based on checksum.' == stderr
prefix = 'nasa:\n'.replace('\n', os.linesep)
filepath = os.path.join('nasa', 'nasa_meta.xml')
assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr


def test_checksum_archive(tmpdir_ch):
    """--checksum-archive skips via checksum first, then via the archive file.

    First run with the flag validates the checksum and records the file in
    _checksum_archive.txt; the second run skips based on that record alone.
    """
    # Seed the working directory with an initial download.
    call_cmd('ia --insecure download nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}

    # Expected stderr pieces, computed once (os.linesep for platform parity).
    prefix = 'nasa:\n'.replace('\n', os.linesep)
    filepath = os.path.join('nasa', 'nasa_meta.xml')

    # First --checksum-archive run: the checksum is still computed, so the
    # skip message is attributed to "checksum".
    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}
    assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr

    # The validated path must now be recorded in the archive file.
    assert '_checksum_archive.txt' in files_downloaded('.')
    with open(os.path.join('.', '_checksum_archive.txt'), encoding='utf-8') as f:
        assert f.read() == f'{filepath}\n'

    # Second run: the skip comes from the archive entry, not a fresh checksum.
    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}
    assert f'{prefix} skipping {filepath}, file already exists based on checksum_archive.' == stderr


def test_no_directories(tmpdir_ch):
Expand Down

0 comments on commit 8815d75

Please sign in to comment.