feat: Add EuroSAT-MS support

kai-tub committed Jul 5, 2024
1 parent 495885b commit 7131812
Showing 12 changed files with 272 additions and 15 deletions.
116 changes: 104 additions & 12 deletions README.md
@@ -47,6 +47,7 @@ Currently, `rico-hdl` supports:
- [BigEarthNet-MM v2.0][ben]
- [HySpecNet-11k][hyspecnet]
- [UC Merced Land Use][ucmerced]
- [EuroSAT][euro]

Additional datasets will be added in the near future.

@@ -304,27 +305,27 @@ integration_tests/tiffs/UCMerced_LandUse
```
'airplane00':
{
-  'Red': <256x256 int16 safetensors image data>
-  'Green': <256x256 int16 safetensors image data>
-  'Blue': <256x256 int16 safetensors image data>
+  'Red': <256x256 uint8 safetensors image data>
+  'Green': <256x256 uint8 safetensors image data>
+  'Blue': <256x256 uint8 safetensors image data>
},
'airplane42':
{
-  'Red': <256x256 int16 safetensors image data>
-  'Green': <256x256 int16 safetensors image data>
-  'Blue': <256x256 int16 safetensors image data>
+  'Red': <256x256 uint8 safetensors image data>
+  'Green': <256x256 uint8 safetensors image data>
+  'Blue': <256x256 uint8 safetensors image data>
},
'forest10':
{
-  'Red': <256x256 int16 safetensors image data>
-  'Green': <256x256 int16 safetensors image data>
-  'Blue': <256x256 int16 safetensors image data>
+  'Red': <256x256 uint8 safetensors image data>
+  'Green': <256x256 uint8 safetensors image data>
+  'Blue': <256x256 uint8 safetensors image data>
},
'forest99':
{
-  'Red': <256x256 int16 safetensors image data>
-  'Green': <256x256 int16 safetensors image data>
-  'Blue': <256x256 int16 safetensors image data>
+  'Red': <256x256 uint8 safetensors image data>
+  'Green': <256x256 uint8 safetensors image data>
+  'Blue': <256x256 uint8 safetensors image data>
}
```

@@ -352,6 +353,96 @@ tensor = np.stack([safetensor_dict[key] for key in ["Red", "Green", "Blue"]])
assert tensor.shape == (3, 256, 256)
```

### [EuroSAT][euro] Example

First, [download the rico-hdl](#Download) binary and install
the Python [lmdb][pyl] and [safetensors][pys] packages.
Then, to convert the patches from the [EuroSAT][euro] multi-spectral
dataset into the optimized format, call the application with:

```bash
rico-hdl eurosat-multi-spectral --dataset-dir <EURO_SAT_MS_ROOT_DIR> --target-dir Encoded-EuroSAT-MS
```

In [EuroSAT][euro], each patch contains 13 bands from a Sentinel-2 L1C tile.
The encoder converts each patch into a [safetensors][s] dictionary
whose keys are the band names (`B01`, `B02`, ..., `B10`, `B11`, `B12`, `B08A`).

<details>
<summary>Example Input</summary>

```
integration_tests/tiffs/EuroSAT_MS
├── AnnualCrop
│   └── AnnualCrop_1.tif
├── Pasture
│   └── Pasture_300.tif
└── SeaLake
    └── SeaLake_3000.tif
```
</details>

<details>
<summary>LMDB Result</summary>

```
'AnnualCrop_1':
{
'B01': <64x64 uint16 safetensors image data>,
'B02': <64x64 uint16 safetensors image data>,
'B03': <64x64 uint16 safetensors image data>,
'B04': <64x64 uint16 safetensors image data>,
'B05': <64x64 uint16 safetensors image data>,
'B06': <64x64 uint16 safetensors image data>,
'B07': <64x64 uint16 safetensors image data>,
'B08': <64x64 uint16 safetensors image data>,
'B09': <64x64 uint16 safetensors image data>,
'B10': <64x64 uint16 safetensors image data>,
'B11': <64x64 uint16 safetensors image data>,
'B12': <64x64 uint16 safetensors image data>,
'B08A': <64x64 uint16 safetensors image data>,
},
```

</details>

```python
import lmdb
import numpy as np
# import the desired deep-learning library:
# numpy, torch, tensorflow, paddle, flax, mlx
from safetensors.numpy import load
from pathlib import Path

encoded_path = Path("Encoded-EuroSAT-MS")

# Make sure to open the environment only once
# and not every time an item is accessed.
env = lmdb.open(str(encoded_path), readonly=True)

with env.begin() as txn:
    # string encoding is required to map the string to an LMDB key
    safetensor_dict = load(txn.get("AnnualCrop_1".encode()))

tensor = np.stack([safetensor_dict[key] for key in [
    "B01",
    "B02",
    "B03",
    "B04",
    "B05",
    "B06",
    "B07",
    "B08",
    "B09",
    "B10",
    "B11",
    "B12",
    "B08A"
]])
assert tensor.shape == (13, 64, 64)
```
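In a training pipeline, this access pattern is usually wrapped in a small reader class so that the LMDB environment is opened once and patches are stacked on demand. The following is a minimal sketch and not part of `rico-hdl`; the class name `EuroSATMSReader` and the hard-coded band order are illustrative assumptions.

```python
import lmdb
import numpy as np
from safetensors.numpy import load

# Illustrative band order, matching the reading example above.
BAND_ORDER = [
    "B01", "B02", "B03", "B04", "B05", "B06", "B07",
    "B08", "B09", "B10", "B11", "B12", "B08A",
]


class EuroSATMSReader:
    """Hypothetical dataset-style wrapper around the encoded LMDB."""

    def __init__(self, lmdb_path: str):
        # Open the environment once and reuse it for every read.
        self.env = lmdb.open(lmdb_path, readonly=True, lock=False)
        with self.env.begin() as txn:
            self.keys = [key.decode("utf-8") for key, _ in txn.cursor()]

    def __len__(self) -> int:
        return len(self.keys)

    def __getitem__(self, idx: int) -> np.ndarray:
        with self.env.begin() as txn:
            bands = load(txn.get(self.keys[idx].encode()))
        # Stack the 13 bands into a (13, 64, 64) uint16 array.
        return np.stack([bands[name] for name in BAND_ORDER])


reader = EuroSATMSReader("Encoded-EuroSAT-MS")
assert reader[0].shape == (13, 64, 64)
```

Depending on the framework, the stacked `uint16` array typically needs to be cast to a floating-point dtype before being fed to a model.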


## Design

@@ -399,3 +490,4 @@ These characteristics make array-structured data formats less suitable for deep-
[pyl]: https://lmdb.readthedocs.io/en/release/
[pys]: https://github.com/huggingface/safetensors
[ucmerced]: http://weegee.vision.ucmerced.edu/datasets/landuse.html
[euro]: https://zenodo.org/records/7711810
10 changes: 9 additions & 1 deletion flake.nix
@@ -117,7 +117,8 @@
export RICO_HDL_S2_PATH=${./integration_tests/tiffs/BigEarthNet/BigEarthNet-S2}
export RICO_HDL_HYSPECNET_PATH=${./integration_tests/tiffs/HySpecNet-11k}
export RICO_HDL_LMDB_REF_PATH=${./integration_tests/BigEarthNet_LMDB}
-export RICO_HDL_UC_MERCED_PATH=${./integration_tests/BigEarthNet_LMDB}
+export RICO_HDL_UC_MERCED_PATH=${./integration_tests/UCMerced_LandUse}
+export RICO_HDL_EUROSAT_MS_PATH=${./integration_tests/tiffs/EuroSAT_MS}
echo "Running Python integration tests."
pytest ${./integration_tests/test_python_integration.py} && echo "Success!"
'';
@@ -138,6 +139,10 @@
name = "RICO_HDL_UC_MERCED_PATH";
eval = "$PRJ_ROOT/integration_tests/tiffs/UCMerced_LandUse/";
}
{
name = "RICO_HDL_EUROSAT_MS_PATH";
eval = "$PRJ_ROOT/integration_tests/tiffs/EuroSAT_MS";
}
{
name = "RICO_HDL_S1_PATH";
eval = "$PRJ_ROOT/integration_tests/tiffs/BigEarthNet/BigEarthNet-S1";
@@ -165,6 +170,9 @@
{
projectDir = ./.;
preferWheels = true;
editablePackageSources = {
rico-hdl = ./src;
};
})
pkgs.poetry
];
1 change: 1 addition & 0 deletions integration_tests/rico_hdl
70 changes: 70 additions & 0 deletions integration_tests/test_python_integration.py
@@ -1,3 +1,9 @@
# These tests intentionally duplicate some code.
# The main goal of these tests is to ensure that the data is not empty
# (which may happen over time)
# and that the data can be loaded from the LMDB afterwards without any issues.
# This should also ensure that the safetensors format remains functional and reproducible,
# but I do not think that I have to check that all encoded arrays remain identical for all datasets.
import lmdb
import rasterio
import numpy as np
@@ -7,6 +13,7 @@
import pytest
import subprocess
import hashlib
from rico_hdl.rico_hdl import EUROSAT_MS_BANDS


def read_single_band_raster(path):
@@ -59,6 +66,15 @@ def uc_merced_root() -> Path:
    return p


@pytest.fixture(scope="session")
def eurosat_ms_root() -> Path:
str_p = os.environ.get("RICO_HDL_EUROSAT_MS_PATH") or "./tiffs/EUROSAT_MS/"
p = Path(str_p)
assert p.exists()
assert p.is_dir()
return p


# https://docs.pytest.org/en/6.2.x/tmpdir.html
# @pytest.fixture(scope="session")
@pytest.fixture
def encoded_bigearthnet_s1_s2_path(s1_root, s2_root, tmpdir_factory) -> Path:
@@ -107,6 +123,21 @@ def encoded_uc_merced_path(uc_merced_root, tmpdir_factory) -> Path:
    return Path(tmp_path)


@pytest.fixture
def encoded_eurosat_ms_path(eurosat_ms_root, tmpdir_factory) -> Path:
    tmp_path = tmpdir_factory.mktemp("eurosat_ms_lmdb")
    subprocess.run(
        [
            "rico-hdl",
            "eurosat-multi-spectral",
            f"--dataset-dir={eurosat_ms_root}",
            f"--target-dir={tmp_path}",
        ],
        check=True,
    )
    return Path(tmp_path)


def test_bigearthnet_integration(
    s1_root, s2_root, encoded_bigearthnet_s1_s2_path, bigearthnet_lmdb_ref_path
):
@@ -225,3 +256,42 @@ def test_uc_merced_integration(uc_merced_root, encoded_uc_merced_path):
                np.array_equal(source_data, decoded_dict[source_key])
                for decoded_dict in decoded_dicts
            ), f"Couldn't find data in the LMDB database that matches the data from: {source_file}:{source_key}"


def read_all_eurosat_ms_bands(path):
    """
    Given a path to a TIFF file, return all bands as a dictionary
    whose keys are the EuroSAT MS band names.
    """
    with rasterio.open(path) as r:
        return {key: r.read(i) for i, key in enumerate(EUROSAT_MS_BANDS, start=1)}


def test_eurosat_integration(eurosat_ms_root, encoded_eurosat_ms_path):
    source_file_data = {
        file: read_all_eurosat_ms_bands(file)
        for file in eurosat_ms_root.glob("**/*.tif")
    }
    assert len(source_file_data) > 0

    env = lmdb.open(str(encoded_eurosat_ms_path), readonly=True)

    with env.begin(write=False) as txn:
        cur = txn.cursor()
        decoded_lmdb_data = {k.decode("utf-8"): load(v) for (k, v) in cur}

    # The encoded data is nested inside another safetensors dictionary,
    # where the inner keys are the EuroSAT MS band names.
    decoded_dicts = [d for d in decoded_lmdb_data.values()]

    # Only check that the data remains identical, as this is the key guarantee
    # from the Python viewpoint.
    # Iterate over all (file name, raster-data dictionary) pairs and, for each
    # band-name key in a raster-data dictionary, check that the LMDB contains
    # a matching array in some safetensors dictionary under the same band name.
    for source_file, source_data_dict in source_file_data.items():
        for source_key, source_data in source_data_dict.items():
            assert any(
                np.array_equal(source_data, decoded_dict[source_key])
                for decoded_dict in decoded_dicts
            ), f"Couldn't find data in the LMDB database that matches the data from: {source_file}:{source_key}"
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions rico_hdl/__init__.py
@@ -0,0 +1 @@
