Skip to content

Commit

Permalink
Exit early when downloading existing file
Browse files Browse the repository at this point in the history
jjjake#614 moved the skip checks to after the response headers have been
received, which drastically slows down the download process when
the file already exists or has a matching checksum. Since
the file name and checksum are already known prior to download,
these checks should remain at the start to avoid making
a request whose response would be discarded anyway.
  • Loading branch information
ChlodAlejandro committed May 26, 2024
1 parent 4efd45f commit 9142f00
Showing 1 changed file with 26 additions and 25 deletions.
51 changes: 26 additions & 25 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,25 @@ def download(# noqa: max-complexity=38

parent_dir = os.path.dirname(file_path)

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

# Retry loop
while True:
try:
Expand Down Expand Up @@ -256,35 +275,17 @@ def download(# noqa: max-complexity=38

response.raise_for_status()

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
# Check if we should skip based on last modified time...
if not fileobj and not return_responses and os.path.exists(file_path.encode('utf-8')):
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response
Expand Down

0 comments on commit 9142f00

Please sign in to comment.