Skip to content

Commit

Permalink
Merge pull request #645 from benbou8231/add_archive_file
Browse files Browse the repository at this point in the history
Add archive file to cache files that passed checksum validation (#600)
  • Loading branch information
jjjake authored Jun 14, 2024
2 parents c97ec24 + e003e4e commit 8815d75
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 20 deletions.
14 changes: 14 additions & 0 deletions docs/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ Alternatively, you can skip files based on md5 checksums. This will take long
skipping nasa/nasa_meta.xml, file already exists based on checksum.
skipping nasa/nasa_reviews.xml, file already exists based on checksum.

Furthermore, you can skip files based on md5 checksums and use a checksum_archive file. This will be faster than checksum alone because checksums will only need to be calculated once for every file already downloaded. Once calculated successfully, the item/file will be written to the checksum_archive file and succeeding runs will skip the checksum validation::

>>> download('nasa', verbose=True, checksum_archive=True)
nasa:
skipping nasa/__ia_thumb.jpg, file already exists based on checksum_archive.
skipping nasa/globe_west_540.jpg, file already exists based on checksum_archive.
skipping nasa/globe_west_540_thumb.jpg, file already exists based on checksum_archive.
skipping nasa/nasa_archive.torrent, file already exists based on checksum_archive.
skipping nasa_files.xml: 2.56kiB [00:00, 5.76MiB/s]
skipping nasa/nasa_itemimage.jpg, file already exists based on checksum_archive.
skipping nasa/nasa_meta.sqlite, file already exists based on checksum.
skipping nasa/nasa_meta.xml, file already exists based on checksum.
skipping nasa/nasa_reviews.xml, file already exists based on checksum.

By default, the :func:`download <internetarchive.download>` function will download all of the files in an item. However, there are a couple parameters that can be used to download only specific files. Files can be filtered using the ``glob_pattern`` parameter::

>>> download('nasa', verbose=True, glob_pattern='*xml')
Expand Down
6 changes: 6 additions & 0 deletions internetarchive/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ def download(
verbose: bool = False,
ignore_existing: bool = False,
checksum: bool = False,
checksum_archive: bool = False,
destdir: str | None = None,
no_directory: bool = False,
retries: int | None = None,
Expand Down Expand Up @@ -335,6 +336,10 @@ def download(
:param checksum: Skip downloading file based on checksum.
:param checksum_archive: Skip downloading file based on checksum, and skip
checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:param destdir: The directory to download files to.
:param no_directory: Download files to current working
Expand Down Expand Up @@ -368,6 +373,7 @@ def download(
verbose=verbose,
ignore_existing=ignore_existing,
checksum=checksum,
checksum_archive=checksum_archive,
destdir=destdir,
no_directory=no_directory,
retries=retries,
Expand Down
3 changes: 3 additions & 0 deletions internetarchive/cli/ia_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
ia metadata --formats <identifier>
--checksum-archive Skip files based on _checksum_archive.txt
[default: False].
--on-the-fly Download on-the-fly files, as well as other matching
files. on-the-fly files include derivative EPUB, MOBI
and DAISY files [default: False].
Expand Down Expand Up @@ -198,6 +200,7 @@ def main(argv, session: ArchiveSession) -> None:
verbose=not args['--quiet'],
ignore_existing=args['--ignore-existing'],
checksum=args['--checksum'],
checksum_archive=args['--checksum-archive'],
destdir=args['--destdir'],
no_directory=args['--no-directories'],
retries=retries,
Expand Down
71 changes: 57 additions & 14 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,25 @@ def __repr__(self):
f'size={self.size!r}, '
f'format={self.format!r})')

def download(# noqa: max-complexity=38
self, file_path=None, verbose=None, ignore_existing=None,
checksum=None, destdir=None, retries=None, ignore_errors=None,
fileobj=None, return_responses=None, no_change_timestamp=None,
params=None, chunk_size=None, stdout=None, ors=None,
timeout=None):
def download( # noqa: max-complexity=38
self,
file_path=None,
verbose=None,
ignore_existing=None,
checksum=None,
checksum_archive=None,
destdir=None,
retries=None,
ignore_errors=None,
fileobj=None,
return_responses=None,
no_change_timestamp=None,
params=None,
chunk_size=None,
stdout=None,
ors=None,
timeout=None,
):
"""Download the file into the current working directory.
:type file_path: str
Expand All @@ -159,6 +172,11 @@ def download(# noqa: max-complexity=38
:type checksum: bool
:param checksum: (optional) Skip downloading file based on checksum.
:type checksum_archive: bool
:param checksum_archive: (optional) Skip downloading file based on checksum, and
skip checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:type destdir: str
:param destdir: (optional) The directory to download files to.
Expand Down Expand Up @@ -201,6 +219,7 @@ def download(# noqa: max-complexity=38
verbose = False if verbose is None else verbose
ignore_existing = False if ignore_existing is None else ignore_existing
checksum = False if checksum is None else checksum
checksum_archive = False if checksum_archive is None else checksum_archive
retries = retries or 2
ignore_errors = ignore_errors or False
return_responses = return_responses or False
Expand All @@ -215,6 +234,8 @@ def download(# noqa: max-complexity=38
file_path = file_path or self.name

if destdir:
if verbose:
print(f"destdir: {destdir}")
if return_responses is not True:
try:
os.mkdir(destdir)
Expand All @@ -228,13 +249,29 @@ def download(# noqa: max-complexity=38

# Check if we should skip...
if not return_responses and os.path.exists(file_path.encode('utf-8')):
if checksum_archive:
checksum_archive_filename = '_checksum_archive.txt'
if not os.path.exists(checksum_archive_filename):
with open(checksum_archive_filename, 'w', encoding='utf-8') as f:
pass
with open(checksum_archive_filename, encoding='utf-8') as f:
checksum_archive_data = f.read().splitlines()
if file_path in checksum_archive_data:
msg = (
f'skipping {file_path}, '
f'file already exists based on checksum_archive.'
)
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
if ignore_existing:
msg = f'skipping {file_path}, file already exists.'
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
return
elif checksum:
elif checksum or checksum_archive:
with open(file_path, 'rb') as fp:
md5_sum = utils.get_md5(fp)

Expand All @@ -243,6 +280,10 @@ def download(# noqa: max-complexity=38
log.info(msg)
if verbose:
print(f' {msg}', file=sys.stderr)
if checksum_archive:
# add file to checksum_archive to skip it next time
with open(checksum_archive_filename, 'a', encoding='utf-8') as f:
f.write(f'{file_path}\n')
return

# Retry loop
Expand All @@ -256,15 +297,17 @@ def download(# noqa: max-complexity=38
and self.name != f'{self.identifier}_files.xml' \
and os.path.exists(file_path.encode('utf-8')):
st = os.stat(file_path.encode('utf-8'))
if st.st_size != self.size and not checksum:
if st.st_size != self.size and not (checksum or checksum_archive):
headers = {"Range": f"bytes={st.st_size}-"}

response = self.item.session.get(self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params,
headers=headers)
response = self.item.session.get(
self.url,
stream=True,
timeout=timeout,
auth=self.auth,
params=params,
headers=headers,
)
# Get timestamp from Last-Modified header
last_mod_header = response.headers.get('Last-Modified')
if last_mod_header:
Expand Down
10 changes: 8 additions & 2 deletions internetarchive/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ def download(self,
verbose: bool = False,
ignore_existing: bool = False,
checksum: bool = False,
checksum_archive: bool = False,
destdir: str | None = None,
no_directory: bool = False,
retries: int | None = None,
Expand Down Expand Up @@ -627,6 +628,10 @@ def download(self,
:param checksum: Skip downloading file based on checksum.
:param checksum_archive: Skip downloading file based on checksum, and skip
checksum validation if it already succeeded
(will create and use _checksum_archive.txt).
:param destdir: The directory to download files to.
:param no_directory: Download files to current working
Expand Down Expand Up @@ -670,6 +675,7 @@ def download(self,
ignore_existing = bool(ignore_existing)
ignore_errors = bool(ignore_errors)
checksum = bool(checksum)
checksum_archive = bool(checksum_archive)
no_directory = bool(no_directory)
return_responses = bool(return_responses)
no_change_timestamp = bool(no_change_timestamp)
Expand Down Expand Up @@ -746,8 +752,8 @@ def download(self,
ors = True
else:
ors = False
r = f.download(path, verbose, ignore_existing, checksum, destdir,
retries, ignore_errors, fileobj, return_responses,
r = f.download(path, verbose, ignore_existing, checksum, checksum_archive,
destdir, retries, ignore_errors, fileobj, return_responses,
no_change_timestamp, params, None, stdout, ors, timeout)
if return_responses:
responses.append(r)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ line-length = 102
max-complexity = 33

[tool.ruff.pylint]
max-args = 23
max-args = 24
max-branches = 33
max-statements = 124

Expand Down
31 changes: 28 additions & 3 deletions tests/cli/test_ia_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,9 @@ def test_clobber(tmpdir_ch):

stdout, stderr = call_cmd(cmd)
assert files_downloaded('nasa') == {'nasa_meta.xml'}
expected_stderr = ('nasa:\n'
' skipping nasa/nasa_meta.xml, file already exists based on length and date.')
prefix = 'nasa:\n'.replace('\n', os.linesep)
filepath = os.path.join('nasa', 'nasa_meta.xml')
expected_stderr = f'{prefix} skipping {filepath}, file already exists based on length and date.'
assert expected_stderr == stderr


Expand All @@ -84,7 +85,31 @@ def test_checksum(tmpdir_ch):

stdout, stderr = call_cmd('ia --insecure download --checksum nasa nasa_meta.xml')
assert files_downloaded('nasa') == {'nasa_meta.xml'}
assert 'nasa:\n skipping nasa/nasa_meta.xml, file already exists based on checksum.' == stderr
prefix = 'nasa:\n'.replace('\n', os.linesep)
filepath = os.path.join('nasa', 'nasa_meta.xml')
assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr


def test_checksum_archive(tmpdir_ch):
    """--checksum-archive skips via checksum first, then via the archive file.

    First run with the flag validates the checksum and records the file in
    _checksum_archive.txt; the second run skips based on that record alone.
    """
    # Seed the working directory with an initial download.
    call_cmd('ia --insecure download nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}

    # Expected stderr pieces, computed once (os.linesep for platform parity).
    prefix = 'nasa:\n'.replace('\n', os.linesep)
    filepath = os.path.join('nasa', 'nasa_meta.xml')

    # First --checksum-archive run: the checksum is still computed, so the
    # skip message is attributed to "checksum".
    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}
    assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr

    # The validated path must now be recorded in the archive file.
    assert '_checksum_archive.txt' in files_downloaded('.')
    with open(os.path.join('.', '_checksum_archive.txt'), encoding='utf-8') as f:
        assert f.read() == f'{filepath}\n'

    # Second run: the skip comes from the archive entry, not a fresh checksum.
    stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml')
    assert files_downloaded('nasa') == {'nasa_meta.xml'}
    assert f'{prefix} skipping {filepath}, file already exists based on checksum_archive.' == stderr


def test_no_directories(tmpdir_ch):
Expand Down

0 comments on commit 8815d75

Please sign in to comment.