
Commit

initial implementation of http
pjbull committed Sep 1, 2024
1 parent b776bee commit 1e54b21
Showing 13 changed files with 542 additions and 53 deletions.
8 changes: 6 additions & 2 deletions cloudpathlib/__init__.py
@@ -4,9 +4,11 @@
from .azure.azblobclient import AzureBlobClient
from .azure.azblobpath import AzureBlobPath
from .cloudpath import CloudPath, implementation_registry
-from .s3.s3client import S3Client
-from .gs.gspath import GSPath
from .gs.gsclient import GSClient
+from .gs.gspath import GSPath
+from .http.httpclient import HttpClient
+from .http.httppath import HttpPath
+from .s3.s3client import S3Client
from .s3.s3path import S3Path


@@ -27,6 +29,8 @@
    "implementation_registry",
    "GSClient",
    "GSPath",
+    "HttpClient",
+    "HttpPath",
    "S3Client",
    "S3Path",
]
15 changes: 7 additions & 8 deletions cloudpathlib/cloudpath.py
@@ -27,7 +27,6 @@
    Generator,
    List,
    Optional,
-    Sequence,
    Tuple,
    Type,
    TYPE_CHECKING,
@@ -286,11 +285,11 @@ def __setstate__(self, state: Dict[str, Any]) -> None:

    @property
    def _no_prefix(self) -> str:
-        return self._str[len(self.cloud_prefix) :]
+        return self._str[len(self.anchor) :]

    @property
    def _no_prefix_no_drive(self) -> str:
-        return self._str[len(self.cloud_prefix) + len(self.drive) :]
+        return self._str[len(self.anchor) + len(self.drive) :]

    @overload
    @classmethod
@@ -881,9 +880,9 @@ def relative_to(self, other: Self, walk_up: bool = False) -> PurePosixPath:
        # absolute)
        if not isinstance(other, CloudPath):
            raise ValueError(f"{self} is a cloud path, but {other} is not")
-        if self.cloud_prefix != other.cloud_prefix:
+        if self.anchor != other.anchor:
            raise ValueError(
-                f"{self} is a {self.cloud_prefix} path, but {other} is a {other.cloud_prefix} path"
+                f"{self} is a {self.anchor} path, but {other} is a {other.anchor} path"
            )

        kwargs = dict(walk_up=walk_up)
@@ -921,7 +920,7 @@ def parent(self) -> Self:
        return self._dispatch_to_path("parent")

    @property
-    def parents(self) -> Sequence[Self]:
+    def parents(self) -> Tuple[Self, ...]:
        return self._dispatch_to_path("parents")

    @property
@@ -1210,8 +1209,8 @@ def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self:
            path = path[1:]

        # add prefix/anchor if it is not already
-        if not path.startswith(self.cloud_prefix):
-            path = f"{self.cloud_prefix}{path}"
+        if not path.startswith(self.anchor):
+            path = f"{self.anchor}{path}"

        return self.client.CloudPath(path)

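Note: the base-class edits above consistently swap cloud_prefix for anchor when stripping and rebuilding paths. Below is a minimal sketch of why (not part of the commit; example.com is a placeholder host), using the anchor property that HttpPath defines in httppath.py further down. For the existing bucket-style providers the anchor is just the cloud_prefix, so their behavior is unchanged.

from cloudpathlib import HttpPath

# example.com is a placeholder host, not taken from the commit
p = HttpPath("http://example.com/data/file.txt")

# The class-level prefix only identifies the scheme...
assert p.cloud_prefix == "http://"
# ...while the anchor also carries the host, and drive is the host alone
assert p.anchor == "http://example.com/"
assert p.drive == "example.com"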
7 changes: 7 additions & 0 deletions cloudpathlib/http/__init__.py
@@ -0,0 +1,7 @@
from .httpclient import HttpClient
from .httppath import HttpPath

__all__ = [
    "HttpClient",
    "HttpPath",
]
160 changes: 160 additions & 0 deletions cloudpathlib/http/httpclient.py
@@ -0,0 +1,160 @@
from datetime import datetime
import os
import re
import urllib.request
import urllib.parse
import urllib.error
from pathlib import Path
from typing import Iterable, Optional, Tuple, Union, Callable
import shutil
import mimetypes
import urllib.response

import pytz

from cloudpathlib.client import Client, register_client_class
from cloudpathlib.enums import FileCacheMode

from .httppath import HttpPath


@register_client_class("http")
class HttpClient(Client):
    def __init__(
        self,
        file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
        content_type_method: Optional[Callable] = mimetypes.guess_type,
        auth: Optional[urllib.request.BaseHandler] = None,
        custom_list_page_parser: Optional[Callable[[str], Iterable[str]]] = None,
    ):
        super().__init__(file_cache_mode, local_cache_dir, content_type_method)
        self.auth = auth

        if self.auth is None:
            self.opener = urllib.request.build_opener()
        else:
            self.opener = urllib.request.build_opener(self.auth)

        self.custom_list_page_parser = custom_list_page_parser

    def _get_metadata(self, cloud_path: HttpPath) -> dict:
        with self.opener.open(cloud_path.as_url()) as response:
            last_modified = response.headers.get("Last-Modified", None)

            if last_modified is not None:
                # per https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
                last_modified = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")

                # should always be utc https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#gmt
                last_modified = last_modified.replace(tzinfo=pytz.UTC)

            return {
                "size": int(response.headers.get("Content-Length", 0)),
                "last_modified": last_modified,
                "content_type": response.headers.get("Content-Type", None),
            }

    def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path:
        local_path = Path(local_path)
        with self.opener.open(cloud_path.as_url()) as response:
            with open(local_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
        return local_path

    def _exists(self, cloud_path: HttpPath) -> bool:
        request = urllib.request.Request(cloud_path.as_url(), method="HEAD")
        try:
            with self.opener.open(request) as response:
                return response.status == 200
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            if isinstance(e, urllib.error.URLError) or e.code == 404:
                return False
            raise

    def _move_file(self, src: HttpPath, dst: HttpPath, remove_src: bool = True) -> HttpPath:
        self._upload_file(src, dst)
        if remove_src:
            self._remove(src)
        return dst

    def _remove(self, cloud_path: HttpPath, missing_ok: bool = True) -> None:
        request = urllib.request.Request(cloud_path.as_url(), method="DELETE")
        try:
            with self.opener.open(request) as response:
                if response.status != 204:
                    raise Exception(f"Failed to delete {cloud_path}.")
        except urllib.error.HTTPError as e:
            if e.code == 404 and missing_ok:
                pass
            else:
                raise FileNotFoundError(f"Failed to delete {cloud_path}.")

    def _list_dir(self, cloud_path: HttpPath, recursive: bool) -> Iterable[Tuple[HttpPath, bool]]:
        try:
            with self.opener.open(cloud_path.as_url()) as response:
                # Parse the directory listing
                for path, is_dir in self._parse_list_dir_response(
                    response.read().decode(), base_url=str(cloud_path)
                ):
                    yield path, is_dir

                    # If it's a directory and recursive is True, list the contents of the directory
                    if recursive and is_dir:
                        yield from self._list_dir(path, recursive=True)

        except:  # noqa E722
            raise NotImplementedError(
                "Unable to parse response as a listing of files; please provide a custom parser as `custom_list_page_parser`."
            )

    def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: HttpPath) -> HttpPath:
        local_path = Path(local_path)

        # fall back to a generic content type if no content_type_method is configured
        content_type = None
        if self.content_type_method is not None:
            content_type, _ = self.content_type_method(local_path)

        headers = {"Content-Type": content_type or "application/octet-stream"}

        with open(local_path, "rb") as file_data:
            request = urllib.request.Request(
                cloud_path.as_url(), data=file_data.read(), method="PUT", headers=headers
            )
            with self.opener.open(request) as response:
                if response.status != 201 and response.status != 200:
                    raise Exception(f"Failed to upload {local_path} to {cloud_path}.")
        return cloud_path

    def _get_public_url(self, cloud_path: HttpPath) -> str:
        return cloud_path.as_url()

    def _generate_presigned_url(self, cloud_path: HttpPath, expire_seconds: int = 60 * 60) -> str:
        raise NotImplementedError("Presigned URLs are not supported using urllib.")

    def _parse_list_dir_response(
        self, response: str, base_url: str
    ) -> Iterable[Tuple[HttpPath, bool]]:
        # Ensure base_url ends with a trailing slash so joining works
        if not base_url.endswith("/"):
            base_url += "/"

        def _simple_links(html: str) -> Iterable[str]:
            return re.findall(r'<a\s+href="([^"]+)"', html)

        parser: Callable[[str], Iterable[str]] = (
            self.custom_list_page_parser
            if self.custom_list_page_parser is not None
            else _simple_links
        )

        yield from (
            (self.CloudPath(urllib.parse.urljoin(base_url, match)), Path(match).suffix == "")
            for match in parser(response)
        )

    def request(self, url: HttpPath, method: str, **kwargs) -> None:
        request = urllib.request.Request(url.as_url(), method=method, **kwargs)
        with self.opener.open(request) as response:
            return response


HttpClient.HttpPath = HttpClient.CloudPath  # type: ignore
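A rough usage sketch for the client above (not part of the commit; the server URL, credentials, and parser are placeholder assumptions) showing how the auth and custom_list_page_parser hooks are meant to plug in:

import re
import urllib.request

from cloudpathlib import HttpClient

# Authenticate every request through a standard urllib handler (placeholder credentials)
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, "http://example.com/", "user", "secret")
auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)


# Tell the client how to pull links out of this particular server's index pages
def parse_index(html: str):
    return re.findall(r'<a\s+href="([^"]+)"', html)


client = HttpClient(auth=auth_handler, custom_list_page_parser=parse_index)

root = client.CloudPath("http://example.com/files/")
for child in root.iterdir():  # driven by _list_dir and the parser above
    print(child, child.is_file())

print((root / "report.csv").read_text())  # downloads via _download_file, then reads the cached copy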
130 changes: 130 additions & 0 deletions cloudpathlib/http/httppath.py
@@ -0,0 +1,130 @@
from pathlib import PurePosixPath
from typing import Tuple, Union, Optional

import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING

from ..cloudpath import CloudPath, NoStatError, register_path_class


if TYPE_CHECKING:
    from .httpclient import HttpClient


@register_path_class("http")
class HttpPath(CloudPath):
    cloud_prefix = "http://"
    client: "HttpClient"

    def __init__(
        self,
        cloud_path: Union[str, "HttpPath"],
        client: Optional["HttpClient"] = None,
    ) -> None:
        super().__init__(cloud_path, client)

        self._path = (
            PurePosixPath(self._url.path)
            if self._url.path.startswith("/")
            else PurePosixPath(f"/{self._url.path}")
        )

    @property
    def drive(self) -> str:
        # For HTTP paths, no drive; use .anchor for scheme + netloc
        return self._url.netloc

    @property
    def anchor(self) -> str:
        return f"{self._url.scheme}://{self._url.netloc}/"

    @property
    def _no_prefix_no_drive(self) -> str:
        # netloc appears in anchor and drive for httppath; so don't double count
        return self._str[len(self.anchor) - 1 :]

    def is_dir(self) -> bool:
        if not self.exists():
            return False

        # HTTP doesn't really have directories, but some servers might list files if treated as such.
        # Here we'll assume paths without a suffix are dirs.
        return self._path.suffix == ""

    def is_file(self) -> bool:
        if not self.exists():
            return False

        # HTTP doesn't have a direct file check, but we assume if it has a suffix, it's a file
        return self._path.suffix != ""

    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
        pass  # no-op for HTTP Paths

    def touch(self, exist_ok: bool = True) -> None:
        if self.exists():
            if not exist_ok:
                raise FileExistsError(f"File already exists: {self}")

            raise NotImplementedError(
                "Touch not implemented for existing HTTP files since we can't update the modified time."
            )
        else:
            empty_file = Path(TemporaryDirectory().name) / "empty_file.txt"
            empty_file.parent.mkdir(parents=True, exist_ok=True)
            empty_file.write_text("")
            self.client._upload_file(empty_file, self)

    def stat(self, follow_symlinks: bool = True) -> os.stat_result:
        try:
            meta = self.client._get_metadata(self)
        except:  # noqa E722
            raise NoStatError(f"Could not get metadata for {self}")

        return os.stat_result(
            (  # type: ignore
                None,  # mode
                None,  # ino
                self.cloud_prefix,  # dev
                None,  # nlink
                None,  # uid
                None,  # gid
                meta.get("size", 0),  # size
                None,  # atime
                meta.get("last_modified", 0).timestamp(),  # mtime
                None,  # ctime
            )
        )

    def as_url(self, presign: bool = False, expire_seconds: int = 60 * 60) -> str:
        if presign:
            raise NotImplementedError("Presigning not supported for HTTP paths")

        return (
            self._url.geturl()
        )  # recreate from what was initialized so we have the same query params, etc.

    @property
    def name(self) -> str:
        return self._path.name

    @property
    def parents(self) -> Tuple["HttpPath", ...]:
        return super().parents + (self._new_cloudpath(""),)

    def get(self, **kwargs):
        return self.client.request(self, "GET", **kwargs)

    def put(self, **kwargs):
        return self.client.request(self, "PUT", **kwargs)

    def post(self, **kwargs):
        return self.client.request(self, "POST", **kwargs)

    def delete(self, **kwargs):
        return self.client.request(self, "DELETE", **kwargs)

    def head(self, **kwargs):
        return self.client.request(self, "HEAD", **kwargs)
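And a brief sketch of the path class in use (again hypothetical, with a placeholder URL), mixing the inherited CloudPath file operations with the HTTP verb helpers defined at the end of the class:

from cloudpathlib import HttpPath

page = HttpPath("http://example.com/data/report.csv")  # placeholder URL

# Inherited CloudPath operations, backed by the HttpClient methods above
if page.exists():                  # HEAD request via _exists
    print(page.stat().st_size)     # Content-Length reported by _get_metadata
    text = page.read_text()        # downloads to the local cache via _download_file

page.write_text("a,b\n1,2\n")      # uploads with a PUT via _upload_file

# Lower-level escape hatch: issue a verb directly and get back the urllib response
response = page.head()
print(response.headers.get("Content-Type"))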
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -33,4 +33,5 @@ tabulate
tenacity
tqdm
typer
+types-pytz
wheel