First pass at using Last-Modified header to set mtime #614

Merged · 9 commits · Mar 28, 2024
internetarchive/files.py: 70 changes (40 additions & 30 deletions)
@@ -28,6 +28,7 @@
import socket
import sys
from contextlib import nullcontext, suppress
from email.utils import parsedate_to_datetime
from urllib.parse import quote

from requests.exceptions import (
@@ -218,33 +219,6 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
raise OSError(f'{destdir} is not a directory!')
file_path = os.path.join(destdir, file_path)

if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if (st.st_mtime == self.mtime) and (st.st_size == self.size) \
or self.name.endswith('_files.xml') and st.st_size != 0:
msg = f'skipping {file_path}, file already exists based on length and date.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

parent_dir = os.path.dirname(file_path)
try:
if parent_dir != '' and return_responses is not True:
@@ -255,8 +229,44 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
timeout=timeout,
auth=self.auth,
params=params)

# Get timestamp from Last-Modified header
dt = parsedate_to_datetime(response.headers['Last-Modified'])
last_mod_mtime = dt.timestamp()

response.raise_for_status()
if return_responses:

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

if md5_sum == self.md5:
msg = f'skipping {file_path}, file already exists based on checksum.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif not fileobj:
st = os.stat(file_path.encode('utf-8'))
if st.st_mtime == last_mod_mtime:
if self.name == f'{self.identifier}_files.xml' \
or (st.st_size == self.size):
msg = (f'skipping {file_path}, file already exists based on '
'length and date.')
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return

elif return_responses:
return response

if verbose:
@@ -298,11 +308,11 @@ def download(self, file_path=None, verbose=None, ignore_existing=None,
else:
raise exc

# Set mtime with mtime from files.xml.
# Set mtime with timestamp from Last-Modified header
if not no_change_timestamp:
# If we want to set the timestamp to that of the original archive...
with suppress(OSError): # Probably file-like object, e.g. sys.stdout.
os.utime(file_path.encode('utf-8'), (0, self.mtime))
os.utime(file_path.encode('utf-8'), (0, last_mod_mtime))

msg = f'downloaded {self.identifier}/{self.name} to {file_path}'
log.info(msg)
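For context on the change above: the new code parses the HTTP Last-Modified header with email.utils.parsedate_to_datetime, converts it to an epoch timestamp, compares it against the local file's st_mtime to decide whether to skip the download, and finally stamps the downloaded file with os.utime. Note that response.headers['Last-Modified'] is read directly, so the code assumes the server always sends that header. A minimal standalone sketch of the same conversion (the helper name and arguments are illustrative, not part of the library):

import os
from email.utils import parsedate_to_datetime

def apply_last_modified(path, last_modified_value):
    """Set a file's mtime from an HTTP Last-Modified header value."""
    # Header values such as "Tue, 14 Nov 2023 20:25:48 GMT" parse to a
    # timezone-aware datetime; .timestamp() converts that to epoch seconds.
    mtime = parsedate_to_datetime(last_modified_value).timestamp()
    # os.utime takes (atime, mtime); atime is set to 0 here, mirroring
    # the call in files.py.
    os.utime(path, (0, mtime))
    return mtime

# A later download can then be skipped when the stored mtime matches:
#   st = os.stat(path)
#   already_current = st.st_mtime == mtime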
tests/test_api.py: 4 changes (3 additions & 1 deletion)
@@ -266,10 +266,12 @@ def test_upload_validate_identifier():

def test_download(tmpdir):
tmpdir.chdir()
last_mod_header = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
with IaRequestsMock() as rsps:
rsps.add(responses.GET,
f'{PROTOCOL}//archive.org/download/nasa/nasa_meta.xml',
body='test content')
body='test content',
adding_headers=last_mod_header)
rsps.add_metadata_mock('nasa')
download('nasa', 'nasa_meta.xml')
p = os.path.join(str(tmpdir), 'nasa')
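The test updates in test_api.py and test_item.py all follow one pattern: because download() now reads the Last-Modified header unconditionally, every mocked GET response must carry one. A minimal sketch of that mock setup with the responses package (the test name, URL, and direct requests.get call are illustrative, not taken from the PR):

import requests
import responses

LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}

@responses.activate
def test_mock_includes_last_modified():
    # Without adding_headers, response.headers['Last-Modified'] would
    # raise KeyError in code that reads the header unconditionally.
    responses.add(
        responses.GET,
        "https://archive.org/download/nasa/nasa_meta.xml",
        body="test content",
        adding_headers=LAST_MOD_HEADER,
    )
    r = requests.get("https://archive.org/download/nasa/nasa_meta.xml")
    assert r.headers["Last-Modified"] == LAST_MOD_HEADER["Last-Modified"]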
tests/test_item.py: 54 changes (40 additions & 14 deletions)
@@ -24,6 +24,7 @@
DOWNLOAD_URL_RE = re.compile(f'{PROTOCOL}//archive.org/download/.*')
S3_URL_RE = re.compile(r'.*s3.us.archive.org/.*')

EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"}
EXPECTED_S3_HEADERS = {
'content-length': '7557',
'x-archive-queue-derive': '1',
@@ -145,11 +146,15 @@ def test_get_files_no_matches(nasa_item):
def test_download(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
assert len(tmpdir.listdir()) == 1
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
with open('nasa/nasa_meta.xml') as fh:
assert fh.read() == 'new test content'
@@ -158,7 +163,9 @@ def test_download(tmpdir, nasa_item):
def test_download_io_error(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
rsps.reset()
with pytest.raises(ConnectionError):
@@ -167,7 +174,9 @@ def test_download_io_error(tmpdir, nasa_item):

def test_download_ignore_errors(tmpdir, nasa_item):
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
nasa_item.download(files='nasa_meta.xml', ignore_errors=True)

Expand All @@ -177,11 +186,13 @@ def test_download_ignore_existing(tmpdir, nasa_item):
with IaRequestsMock(
assert_all_requests_are_fired=False) as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content')
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', ignore_existing=True)

rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content')
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', ignore_existing=True)
with open('nasa/nasa_meta.xml') as fh:
assert fh.read() == 'test content'
@@ -190,11 +201,15 @@ def test_download_ignore_existing(tmpdir, nasa_item):
def test_download_clobber(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')

rsps.reset()
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml')
assert load_file('nasa/nasa_meta.xml') == 'new test content'

Expand All @@ -205,8 +220,12 @@ def test_download_checksum(tmpdir, caplog):
# test overwrite based on checksum.
with IaRequestsMock() as rsps:
rsps.add_metadata_mock('nasa')
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content')
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='overwrite based on md5')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='test content',
adding_headers=EXPECTED_LAST_MOD_HEADER)
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='overwrite based on md5',
adding_headers=EXPECTED_LAST_MOD_HEADER)

nasa_item = get_item('nasa')
nasa_item.download(files='nasa_meta.xml')
@@ -218,7 +237,8 @@ def test_download_checksum(tmpdir, caplog):
with caplog.at_level(logging.DEBUG):
rsps.reset()
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body=load_test_data_file('nasa_meta.xml'))
body=load_test_data_file('nasa_meta.xml'),
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)
nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True)

@@ -229,7 +249,9 @@ def test_download_checksum(tmpdir, caplog):
def test_download_destdir(tmpdir, nasa_item):
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new destdir')
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='new destdir',
adding_headers=EXPECTED_LAST_MOD_HEADER)
dest = os.path.join(str(tmpdir), 'new destdir')
nasa_item.download(files='nasa_meta.xml', destdir=dest)
assert 'nasa' in os.listdir(dest)
@@ -241,7 +263,9 @@ def test_download_destdir(tmpdir, nasa_item):
url_re = re.compile(f'{PROTOCOL}//archive.org/download/.*')
tmpdir.chdir()
with IaRequestsMock() as rsps:
rsps.add(responses.GET, url_re, body='no dest dir')
rsps.add(responses.GET, url_re,
body='no dest dir',
adding_headers=EXPECTED_LAST_MOD_HEADER)
nasa_item.download(files='nasa_meta.xml', no_directory=True)
with open(os.path.join(str(tmpdir), 'nasa_meta.xml')) as fh:
assert fh.read() == 'no dest dir'
@@ -278,9 +302,11 @@ def test_download_dry_run_on_the_fly_formats(tmpdir, capsys, nasa_item):
def test_download_verbose(tmpdir, capsys, nasa_item):
tmpdir.chdir()
with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
headers = {'content-length': '11'}
headers.update(EXPECTED_LAST_MOD_HEADER)
rsps.add(responses.GET, DOWNLOAD_URL_RE,
body='no dest dir',
adding_headers={'content-length': '11'})
adding_headers=headers)
nasa_item.download(files='nasa_meta.xml', verbose=True)
out, err = capsys.readouterr()
assert 'downloading nasa_meta.xml' in err