Skip to content

Commit

Permalink
init download function
Browse files Browse the repository at this point in the history
  • Loading branch information
Marvin Hofer committed Sep 13, 2023
1 parent 50db7e4 commit 810b55f
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 2 deletions.
8 changes: 6 additions & 2 deletions python/databusclient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,9 @@ def deploy(


@app.command()
def download(collection: str):
    """Placeholder command: the download is not implemented, only a marker is printed."""
    # `collection` is accepted but not used yet.
    typer.echo("TODO")
def download(
    localDir: str = typer.Option(..., help="local databus folder"),
    databus: str = typer.Option(..., help="databus URL"),
    databusURIs: List[str] = typer.Argument(
        ...,
        help="any kind of these: databus identifier, databus collection identifier, query file",
    ),
):
    """Forward the command-line arguments to ``client.download``.

    ``databus`` is handed over as the ``endpoint`` argument; the other two
    values are passed through under their own names.
    """
    client.download(
        localDir=localDir,
        endpoint=databus,
        databusURIs=databusURIs,
    )
100 changes: 100 additions & 0 deletions python/databusclient/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import requests
import hashlib
import json
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.parse import urldefrag
from hashlib import sha256

__debug = False

Expand Down Expand Up @@ -374,3 +377,100 @@ def deploy(
if debug or __debug:
print("---------")
print(resp.text)


def __download_file__(url, filename):
    """
    Download a file from the internet with a progress bar using tqdm.

    Parameters:
    - url: the URL of the file to download
    - filename: the local file path where the file should be saved

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status, so an
      error page is never stored as if it were the requested file
    """
    print("download "+url)
    block_size = 1024  # 1 Kibibyte

    # Context managers release the connection, the file handle and the
    # progress bar even when reading the stream or writing to disk fails.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # A missing Content-Length header is treated as "size unknown" (0).
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        with open(filename, 'wb') as file, tqdm(
            total=total_size_in_bytes, unit='iB', unit_scale=True
        ) as progress_bar:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
            downloaded = progress_bar.n

    # Only compare sizes when the server announced one.
    if total_size_in_bytes != 0 and downloaded != total_size_in_bytes:
        print("ERROR, something went wrong")


def __query_sparql__(endpoint_url, query)-> dict:
    """
    Send a SPARQL query to an endpoint via POST and return the converted
    JSON response.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: the SPARQL query string

    Returns:
    - Dictionary containing the query results
    """
    wrapper = SPARQLWrapper(endpoint_url)
    wrapper.method = 'POST'
    wrapper.setReturnFormat(JSON)
    wrapper.setQuery(query)
    return wrapper.query().convert()


def __handle__databus_file_query__(endpoint_url, query) -> List[str]:
    """
    Run a query and collect the value of its single result variable.

    Parameters:
    - endpoint_url: the URL of the SPARQL endpoint
    - query: SPARQL query expected to bind exactly one variable per row

    Returns:
    - List with the 'value' of every result row, in response order.
      Collection stops at the first row that binds more than one variable;
      the values gathered up to that point are still returned.
    """
    result_dict = __query_sparql__(endpoint_url, query)
    values = []
    for binding in result_dict['results']['bindings']:
        if len(binding) > 1:
            print("Error multiple bindings in query response")
            break
        # At most one entry is left here; a row without any bound variable
        # simply contributes nothing instead of raising.
        for cell in binding.values():
            values.append(cell['value'])
    # A real list (as annotated) can be iterated more than once and measured
    # with len(), unlike the one-shot generator this used to be.
    return values


def wsha256(raw: str):
    """Return the hexadecimal SHA-256 digest of ``raw`` encoded as UTF-8."""
    digest = sha256()
    digest.update(raw.encode('utf-8'))
    return digest.hexdigest()


def __handle_databus_collection__(endpoint, uri: str)-> str:
    """
    Fetch the text served for a databus collection when SPARQL is requested.

    Parameters:
    - endpoint: not used by this function; kept so existing callers keep working
    - uri: the collection identifier, requested as an HTTP(S) URL

    Returns:
    - the response body obtained with the header "Accept: text/sparql"

    Raises:
    - requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    headers = {"Accept": "text/sparql"}
    response = requests.get(uri, headers=headers)
    # Fail loudly instead of handing an error page on as if it were a query.
    response.raise_for_status()
    return response.text


def __download_list__(urls: List[str], localDir: str):
    """
    Download every URL into ``localDir``.

    Each file is stored under the SHA-256 hex digest of its URL, so the same
    URL always maps to the same local file name.

    Parameters:
    - urls: the download URLs
    - localDir: target directory; created (including parents) if missing
    """
    # Local import: keeps this function self-contained regardless of what
    # the module header already imports.
    import os

    # Opening a file inside a non-existent directory fails, so make sure the
    # target exists first. An empty string means "current directory".
    if localDir:
        os.makedirs(localDir, exist_ok=True)

    for url in urls:
        __download_file__(url=url, filename=os.path.join(localDir, wsha256(url)))


def download(
    localDir: str,
    endpoint: str,
    databusURIs: List[str]
) -> None:
    """
    Download datasets to local storage from databus registry
    ------
    localDir: the local directory
    endpoint: URL of the SPARQL endpoint the file queries are sent to
    databusURIs: identifiers to access databus registered datasets

    Each entry is handled by its form:
    - http(s) URL containing "/collections/": the collection's query is
      fetched and executed, and the resulting files are downloaded
    - any other http(s) URL (dataId): not supported yet, skipped with a message
    - "file://" URL (query stored in a file): not supported yet, skipped
      with a message
    - anything else: treated as a SPARQL query string and executed directly
    """
    for databusURI in databusURIs:
        # dataID or databus collection
        if databusURI.startswith(("http://", "https://")):
            if "/collections/" not in databusURI:
                print("dataId not supported yet")
                # Nothing to download for this entry; without skipping, the
                # result variable would be unbound or left over from the
                # previous entry.
                continue
            # databus collection
            query = __handle_databus_collection__(endpoint, databusURI)
        # query in local file
        elif databusURI.startswith("file://"):
            print("query in file not supported yet")
            continue
        # query as argument
        else:
            query = databusURI
            # str.format keeps this valid on Python 3.9 (no backslash inside
            # an f-string expression) and actually fills the placeholder.
            print("QUERY {}".format(query.replace("\n", " ")))

        res = __handle__databus_file_query__(endpoint, query)
        __download_list__(res, localDir)
3 changes: 3 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ readme = "README.md"
python = "^3.9"
typer = "^0.6.1"
requests = "^2.28.1"
tqdm = "^2.2.3"
SPARQLWrapper = "^2.0.0"


[tool.poetry.dev-dependencies]
black = "^22.6.0"
Expand Down
20 changes: 20 additions & 0 deletions python/tests/test_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Download Tests"""
import pytest
import databusclient.client as cl

DEFAULT_ENDPOINT="https://databus.dbpedia.org/sparql"
TEST_QUERY="""
PREFIX dcat: <http://www.w3.org/ns/dcat#>
SELECT ?x WHERE {
?sub dcat:downloadURL ?x .
} LIMIT 10
"""
TEST_COLLECTION="https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12"

def test_with_query(tmp_path):
    """Download the files selected by TEST_QUERY into a per-test temp folder."""
    # tmp_path is created by pytest, so the test neither depends on a
    # pre-existing "target" folder nor leaves files in the working directory.
    cl.download(str(tmp_path), DEFAULT_ENDPOINT, [TEST_QUERY])
    # The query is expected to select download URLs, so a successful run
    # must have written at least one file.
    assert any(tmp_path.iterdir())

def test_with_collection(tmp_path):
    """Download the files of TEST_COLLECTION into a per-test temp folder."""
    # tmp_path is created by pytest, so the test neither depends on a
    # pre-existing "target" folder nor leaves files in the working directory.
    cl.download(str(tmp_path), DEFAULT_ENDPOINT, [TEST_COLLECTION])
    # A successful collection download must have written at least one file.
    assert any(tmp_path.iterdir())

0 comments on commit 810b55f

Please sign in to comment.