From 9142f005c9736fc39f65805cb0efe00b7d281a87 Mon Sep 17 00:00:00 2001
From: Chlod Alejandro <chlod@chlod.net>
Date: Sun, 26 May 2024 15:39:08 +0800
Subject: [PATCH] Exit early when downloading existing file

#614 moved the skip checks to after the response headers have been
received, which drastically slows down the download process when
the file already exists or has a matching checksum. Since the file
name and checksum are already known prior to download, these checks
should remain at the start to avoid making a request whose response
would be discarded anyway.
---
 internetarchive/files.py | 51 ++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/internetarchive/files.py b/internetarchive/files.py
index 020b9fa9..7b43b9f3 100644
--- a/internetarchive/files.py
+++ b/internetarchive/files.py
@@ -226,6 +226,25 @@ def download(# noqa: max-complexity=38
 
         parent_dir = os.path.dirname(file_path)
 
+        # Check if we should skip...
+        if not return_responses and os.path.exists(file_path.encode('utf-8')):
+            if ignore_existing:
+                msg = f'skipping {file_path}, file already exists.'
+                log.info(msg)
+                if verbose:
+                    print(f' {msg}', file=sys.stderr)
+                return
+            elif checksum:
+                with open(file_path, 'rb') as fp:
+                    md5_sum = utils.get_md5(fp)
+
+                if md5_sum == self.md5:
+                    msg = f'skipping {file_path}, file already exists based on checksum.'
+                    log.info(msg)
+                    if verbose:
+                        print(f' {msg}', file=sys.stderr)
+                    return
+
         # Retry loop
         while True:
             try:
@@ -256,35 +275,17 @@ def download(# noqa: max-complexity=38
 
                 response.raise_for_status()
 
-                # Check if we should skip...
-                if not return_responses and os.path.exists(file_path.encode('utf-8')):
-                    if ignore_existing:
-                        msg = f'skipping {file_path}, file already exists.'
-                        log.info(msg)
-                        if verbose:
-                            print(f' {msg}', file=sys.stderr)
-                        return
-                    elif checksum:
-                        with open(file_path, 'rb') as fp:
-                            md5_sum = utils.get_md5(fp)
-
-                        if md5_sum == self.md5:
-                            msg = f'skipping {file_path}, file already exists based on checksum.'
+                # Check if we should skip based on last modified time...
+                if not fileobj and not return_responses and os.path.exists(file_path.encode('utf-8')):
+                    st = os.stat(file_path.encode('utf-8'))
+                    if st.st_mtime == last_mod_mtime:
+                        if self.name == f'{self.identifier}_files.xml' or (st.st_size == self.size):
+                            msg = (f'skipping {file_path}, file already exists based on '
+                                    'length and date.')
                             log.info(msg)
                             if verbose:
                                 print(f' {msg}', file=sys.stderr)
                             return
-                    elif not fileobj:
-                        st = os.stat(file_path.encode('utf-8'))
-                        if st.st_mtime == last_mod_mtime:
-                            if self.name == f'{self.identifier}_files.xml' \
-                                or (st.st_size == self.size):
-                                msg = (f'skipping {file_path}, file already exists based on '
-                                        'length and date.')
-                                log.info(msg)
-                                if verbose:
-                                    print(f' {msg}', file=sys.stderr)
-                                return
 
                 elif return_responses:
                     return response