Skip to content

Commit

Permalink
Fix globbing top level buckets and (#312)
Browse files Browse the repository at this point in the history
* Add regression tests for 311

* form paths properly in glob and error above bucket

* changelog and version
  • Loading branch information
pjbull authored Jan 5, 2023
1 parent 2556df0 commit caeeb50
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 3 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# cloudpathlib Changelog

## v0.12.1 (2023-01-04)

- Fix glob logic for buckets; add regression test; add error on globbing all buckets ([Issue #311](https://github.com/drivendataorg/cloudpathlib/issues/311), [PR #312](https://github.com/drivendataorg/cloudpathlib/pull/312))

## v0.12.0 (2022-12-30)

- API Change: `S3Client` supports an `extra_args` kwarg now to pass extra args down to `boto3` functions; this enables Requester Pays bucket access and bucket encryption. (Issues [#254](https://github.com/drivendataorg/cloudpathlib/issues/254), [#180](https://github.com/drivendataorg/cloudpathlib/issues/180); [PR #307](https://github.com/drivendataorg/cloudpathlib/pull/307))
Expand Down
10 changes: 8 additions & 2 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,11 @@ def _glob_checks(self, pattern: str) -> None:
if pattern.startswith(self.cloud_prefix) or pattern.startswith("/"):
raise CloudPathNotImplementedError("Non-relative patterns are unsupported")

if self.drive == "":
raise CloudPathNotImplementedError(
".glob is only supported within a bucket or container; you can use `.iterdir` to list buckets; for example, CloudPath('s3://').iterdir()"
)

def _glob(
self: DerivedCloudPath, selector, recursive: bool
) -> Generator[DerivedCloudPath, None, None]:
Expand Down Expand Up @@ -390,12 +395,13 @@ def _build_tree(trunk, branch, nodes, is_dir):

root = _CloudPathSelectable(
self.name,
[p.name for p in self.parents[:-1]], # all parents except bucket/container
[], # nothing above self will be returned, so initial parents is empty
file_tree,
)

for p in selector.select_from(root):
yield self.client.CloudPath(f"{self.cloud_prefix}{self.drive}/{p}")
# select_from returns self.name/... so strip before joining
yield (self / str(p)[len(self.name) + 1 :])

def glob(self: DerivedCloudPath, pattern: str) -> Generator[DerivedCloudPath, None, None]:
self._glob_checks(pattern)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,5 @@ def load_requirements(path: Path):
"Source Code": "https://github.com/drivendataorg/cloudpathlib",
},
url="https://github.com/drivendataorg/cloudpathlib",
version="0.12.0",
version="0.12.1",
)
19 changes: 19 additions & 0 deletions tests/test_cloudpath_file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,25 @@ def _check_glob(pattern, glob_method):
)


def test_glob_buckets(rig):
# CloudPath("s3://").glob("*") results in error
drive_level = rig.path_class(rig.path_class.cloud_prefix)

with pytest.raises(CloudPathNotImplementedError):
list(drive_level.glob("*"))

# CloudPath("s3://bucket").glob("*") should work
# bucket level glob returns correct results
# regression test for #311
bucket = rig.path_class(f"{rig.path_class.cloud_prefix}{rig.drive}")

first_result = next(bucket.glob("*"))

# assert all parts are unique
assert first_result.drive == rig.drive
assert len(first_result.parts) == len(set(first_result.parts))


def test_glob_many_open_files(rig):
# test_glob_many_open_files
# Adapted from: https://github.com/python/cpython/blob/7ffe7ba30fc051014977c6f393c51e57e71a6648/Lib/test/test_pathlib.py#L1697-L1712
Expand Down

0 comments on commit caeeb50

Please sign in to comment.