Fix globbing top level buckets and (#312)

* Add regression tests for 311 * form paths properly in glob and error above bucket * changelog and version
drivendataorg · Jan 5, 2023 · caeeb50 · caeeb50
1 parent 2556df0
commit caeeb50
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 3 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,9 @@
 # cloudpathlib Changelog
 
+## v0.12.1 (2023-01-04)
+
+ - Fix glob logic for buckets; add regression test; add error on globbing all buckets ([Issue #311](https://github.com/drivendataorg/cloudpathlib/issues/311), [PR #312](https://github.com/drivendataorg/cloudpathlib/pull/312))
+
 ## v0.12.0 (2022-12-30)
 
  - API Change: `S3Client` supports an `extra_args` kwarg now to pass extra args down to `boto3` functions; this enables Requester Pays bucket access and bucket encryption. (Issues [#254](https://github.com/drivendataorg/cloudpathlib/issues/254), [#180](https://github.com/drivendataorg/cloudpathlib/issues/180); [PR #307](https://github.com/drivendataorg/cloudpathlib/pull/307))

diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py
@@ -357,6 +357,11 @@ def _glob_checks(self, pattern: str) -> None:
         if pattern.startswith(self.cloud_prefix) or pattern.startswith("/"):
             raise CloudPathNotImplementedError("Non-relative patterns are unsupported")
 
+        if self.drive == "":
+            raise CloudPathNotImplementedError(
+                ".glob is only supported within a bucket or container; you can use `.iterdir` to list buckets; for example, CloudPath('s3://').iterdir()"
+            )
+
     def _glob(
         self: DerivedCloudPath, selector, recursive: bool
     ) -> Generator[DerivedCloudPath, None, None]:
@@ -390,12 +395,13 @@ def _build_tree(trunk, branch, nodes, is_dir):
 
         root = _CloudPathSelectable(
             self.name,
-            [p.name for p in self.parents[:-1]],  # all parents except bucket/container
+            [],  # nothing above self will be returned, so initial parents is empty
             file_tree,
         )
 
         for p in selector.select_from(root):
-            yield self.client.CloudPath(f"{self.cloud_prefix}{self.drive}/{p}")
+            # select_from returns self.name/... so strip before joining
+            yield (self / str(p)[len(self.name) + 1 :])
 
     def glob(self: DerivedCloudPath, pattern: str) -> Generator[DerivedCloudPath, None, None]:
         self._glob_checks(pattern)

diff --git a/setup.py b/setup.py
@@ -61,5 +61,5 @@ def load_requirements(path: Path):
         "Source Code": "https://github.com/drivendataorg/cloudpathlib",
     },
     url="https://github.com/drivendataorg/cloudpathlib",
-    version="0.12.0",
+    version="0.12.1",
 )
diff --git a/tests/test_cloudpath_file_io.py b/tests/test_cloudpath_file_io.py
@@ -191,6 +191,25 @@ def _check_glob(pattern, glob_method):
     )
 
 
+def test_glob_buckets(rig):
+    # CloudPath("s3://").glob("*") results in error
+    drive_level = rig.path_class(rig.path_class.cloud_prefix)
+
+    with pytest.raises(CloudPathNotImplementedError):
+        list(drive_level.glob("*"))
+
+    # CloudPath("s3://bucket").glob("*") should work
+    # bucket level glob returns correct results
+    # regression test for #311
+    bucket = rig.path_class(f"{rig.path_class.cloud_prefix}{rig.drive}")
+
+    first_result = next(bucket.glob("*"))
+
+    # assert all parts are unique
+    assert first_result.drive == rig.drive
+    assert len(first_result.parts) == len(set(first_result.parts))
+
+
 def test_glob_many_open_files(rig):
     # test_glob_many_open_files
     #  Adapted from: https://github.com/python/cpython/blob/7ffe7ba30fc051014977c6f393c51e57e71a6648/Lib/test/test_pathlib.py#L1697-L1712