Skip to content

Commit

Permalink
Merge pull request #182 from simleo/http_header
Browse files: browse the repository at this point in the history
Better handling of HTTP header
  • Loading branch information
simleo authored Apr 10, 2024
2 parents 00f4093 + 1b66e14 commit a551acb
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 11 deletions.
23 changes: 12 additions & 11 deletions rocrate/model/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
# limitations under the License.

from pathlib import Path
import requests
import shutil
import urllib.request
import warnings
from http.client import HTTPResponse
from io import BytesIO, StringIO

from .file_or_dir import FileOrDir
Expand All @@ -48,19 +48,20 @@ def write(self, base_path):
out_file.write(self.source.getvalue())
elif is_url(str(self.source)):
if self.fetch_remote or self.validate_url:
with urllib.request.urlopen(self.source) as response:
if self.validate_url:
if isinstance(response, HTTPResponse):
if self.validate_url:
if self.source.startswith("http"):
with requests.head(self.source) as response:
self._jsonld.update({
'contentSize': response.getheader('Content-Length'),
'encodingFormat': response.getheader('Content-Type')
'contentSize': response.headers.get('Content-Length'),
'encodingFormat': response.headers.get('Content-Type')
})
if not self.fetch_remote:
self._jsonld['sdDatePublished'] = iso_now()
if self.fetch_remote:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(response.url, out_file_path)
self._jsonld['contentUrl'] = str(self.source)
date_published = response.headers.get("Last-Modified", iso_now())
self._jsonld['sdDatePublished'] = date_published
if self.fetch_remote:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(self.source, out_file_path)
self._jsonld['contentUrl'] = str(self.source)
elif self.source is None:
# Allows to record a File entity whose @id does not exist, see #73
warnings.warn(f"No source for {self.id}")
Expand Down
17 changes: 17 additions & 0 deletions test/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import io
import pytest
import requests
import os
import uuid
import sys
Expand Down Expand Up @@ -435,3 +436,19 @@ def test_add_tree(test_data_dir, tmpdir):

with pytest.raises(ValueError):
crate.add_tree(None, dest_path="foobar")


def test_http_header(tmpdir):
    """Check the metadata recorded from HTTP headers for a validated URL.

    A remote file is added to a new crate with ``validate_url=True``; the
    crate is written under ``tmpdir`` and read back. The reloaded entity
    must report ``application/octet-stream`` as its ``encodingFormat`` and
    carry an ``sdDatePublished`` equal to the ``last-modified`` header
    returned by a fresh HEAD request to the same URL.

    NOTE: this test contacts the remote URL, so it needs network access.
    """
    remote_url = "https://zenodo.org/records/10782431/files/lysozyme_datasets.zip"
    source_crate = ROCrate()
    added_entity = source_crate.add_file(remote_url, validate_url=True)
    assert added_entity.id == remote_url

    # Round-trip the crate through disk so the assertions below run against
    # what was actually serialized, not the in-memory entity.
    crate_dir = tmpdir / 'ro_crate_out'
    source_crate.write(crate_dir)
    reloaded_entity = ROCrate(crate_dir).dereference(remote_url)
    metadata = reloaded_entity.properties()

    assert metadata.get("encodingFormat") == "application/octet-stream"
    assert "sdDatePublished" in metadata
    with requests.head(remote_url) as head_response:
        expected_date = head_response.headers.get("last-modified")
        assert metadata["sdDatePublished"] == expected_date

0 comments on commit a551acb

Please sign in to comment.