diff --git a/cyano/config.py b/cyano/config.py index adc2eb85..46e8f94b 100644 --- a/cyano/config.py +++ b/cyano/config.py @@ -22,6 +22,7 @@ class FeaturesConfig(BaseModel): pc_meters_search_window: Optional[int] = 1000 use_sentinel_bands: Optional[List] = ["B02", "B03", "B04"] image_feature_meter_window: Optional[int] = 500 + n_sentinel_items: Optional[int] = 1 satellite_features: Optional[List] = [ "B02_mean", "B02_min", diff --git a/cyano/data/features.py b/cyano/data/features.py index e94b793e..373152f4 100644 --- a/cyano/data/features.py +++ b/cyano/data/features.py @@ -51,26 +51,32 @@ def generate_satellite_features( # Iterate over samples for uid in tqdm(uids): satellite_features_dict[uid] = {} - sample_dir = Path(cache_dir) / f"satellite/{uid}" + sample_dir = Path(cache_dir) / f"sentinel_{config.image_feature_meter_window}/{uid}" # Skip samples with no imagery if not sample_dir.exists(): continue - # Load stacked array for each image + # Load band arrays for each image # Right now we only have one item per sample, process will need to # change if we have multiple - item_paths = list(sample_dir.glob("*.npy")) - if len(item_paths) > 1: + item_dirs = list(sample_dir.iterdir()) + if len(item_dirs) == 0: + continue + elif len(item_dirs) > 1: raise NotImplementedError( f"{uid} has multiple items, cannot process multiple items per sample" ) - stacked_array = np.load(item_paths[0]) - # Load stacked array in dictionary form with band names for keys + item_dir = item_dirs[0] + # Load band arrays into a dictionary with band names for keys band_arrays = {} # If we want to mask image data with water boundaries in some way, add here - for idx, band in enumerate(config.use_sentinel_bands): - band_arrays[band] = stacked_array[idx] + for band in config.use_sentinel_bands: + if not (item_dir / f"{band}.npy").exists(): + raise FileNotFoundError( + f"Band {band} is missing from pystac item directory {item_dir}" + ) + band_arrays[band] = np.load(item_dir / f"{band}.npy") # 
Iterate over features to generate for feature in config.satellite_features: diff --git a/cyano/data/satellite_data.py b/cyano/data/satellite_data.py index fd7c2c0b..c54e8fba 100644 --- a/cyano/data/satellite_data.py +++ b/cyano/data/satellite_data.py @@ -1,6 +1,9 @@ from datetime import timedelta -from typing import List, Union +import json +import shutil +from typing import Dict, List, Tuple, Union +from cloudpathlib import AnyPath import geopy.distance as distance from loguru import logger import numpy as np @@ -41,13 +44,13 @@ def get_bounding_box(latitude: float, longitude: float, meters_window: int) -> L def get_date_range(date: str, days_window: int) -> str: """Get a date range to search for in the planetary computer based on a sample's date. The time range will go from time_buffer_days - before the sample date to time_buffer_days after the sample date + before the sample date to the sample date Returns a string""" datetime_format = "%Y-%m-%d" - range_start = pd.to_datetime(date) - timedelta(days=days_window) - range_end = pd.to_datetime(date) + timedelta(days=days_window) - date_range = f"{range_start.strftime(datetime_format)}/{range_end.strftime(datetime_format)}" + date = pd.to_datetime(date) + range_start = date - timedelta(days=days_window) + date_range = f"{range_start.strftime(datetime_format)}/{date.strftime(datetime_format)}" return date_range @@ -85,6 +88,28 @@ def search_planetary_computer( return search_results +def bbox_from_geometry(geometry: Dict) -> Dict: + """For pystac items that don't have the bbox attribute, get the + bbox from the geometry + + Args: + geometry (Dict): A dictionary of geometry from item.geometry + + Returns: + Dict: Dictionary with keys for min_long, max_long, min_lat, + and max_lat + """ + lons = [coord_pair[0] for coord_pair in geometry["coordinates"][0]] + lats = [coord_pair[1] for coord_pair in geometry["coordinates"][0]] + + return { + "min_long": min(lons), + "max_long": max(lons), + "min_lat": min(lats), + 
"max_lat": max(lats), + } + + def get_items_metadata( search_results: ItemSearch, latitude: float, @@ -111,13 +136,24 @@ def get_items_metadata( item_meta = { "item_id": item.id, "datetime": item.datetime.strftime("%Y-%m-%d"), - "min_long": item.bbox[0], - "max_long": item.bbox[2], - "min_lat": item.bbox[1], - "max_lat": item.bbox[3], + "platform": item.properties["platform"], } + # Add item bounding box + if "bbox" in item.to_dict(): + item_meta.update( + { + "min_long": item.bbox[0], + "max_long": item.bbox[2], + "min_lat": item.bbox[1], + "max_lat": item.bbox[3], + } + ) + elif "geometry" in item.to_dict(): + bbox_dict = bbox_from_geometry(item.geometry) + item_meta.update(bbox_dict) + if "eo:cloud_cover" in item.properties: - item_meta.update({"cloud_cover": item.properties["eo:cloud_cover"]}) + item_meta.update({"eo:cloud_cover": item.properties["eo:cloud_cover"]}) # Add links to download each band needed for features for band in config.use_sentinel_bands: item_meta.update({f"{band}_href": item.assets[band].href}) @@ -137,123 +173,207 @@ def get_items_metadata( return items_meta +def generate_candidate_metadata( + samples: pd.DataFrame, config: FeaturesConfig +) -> Tuple[pd.DataFrame, Dict]: + """Generate metadata for all of the satellite item candidates + that could be used to generate features for each sample + + Args: + samples (pd.DataFrame): Dataframe where the index is uid and + there are columns for date, longitude, and latitude + config (FeaturesConfig): Features config + + Returns: + Tuple[pd.DataFrame, Dict]: Tuple of (metadata for all sentinel item + candidates, dictionary mapping sample UIDs to the relevant + pystac item IDs) + """ + logger.info("Generating metadata for all satellite item candidates") + + if len(samples) > 20: + # Load from saved directory with search results for all competition data + # Remove for final package + pc_results_dir = ( + AnyPath("s3://drivendata-competition-nasa-cyanobacteria") + / "data/interim/full_pc_search" + ) + 
sentinel_meta = pd.read_csv(pc_results_dir / "sentinel_metadata.csv") + logger.info( + f"Loaded {sentinel_meta.shape[0]:,} rows of Sentinel candidate metadata from {pc_results_dir}" + ) + with open(pc_results_dir / "sample_item_map.json", "r") as fp: + sample_item_map = json.load(fp) + + return (sentinel_meta, sample_item_map) + + # Otherwise, search the planetary computer + logger.info( + f"Searching {config.pc_collections} within {config.pc_days_search_window} days and {config.pc_meters_search_window} meters" + ) + sentinel_meta = [] + sample_item_map = {} + for sample in tqdm(samples.itertuples(), total=len(samples)): + # Search planetary computer + search_results = search_planetary_computer( + sample.date, + sample.latitude, + sample.longitude, + collections=config.pc_collections, + days_search_window=config.pc_days_search_window, + meters_search_window=config.pc_meters_search_window, + ) + + # Get satellite metadata + sample_items_meta = get_items_metadata( + search_results, sample.latitude, sample.longitude, config + ) + + sample_item_map[sample.Index] = { + "sentinel_item_ids": sample_items_meta.item_id.tolist() + if len(sample_items_meta) > 0 + else [] + } + sentinel_meta.append(sample_items_meta) + sentinel_meta = ( + pd.concat(sentinel_meta).groupby("item_id", as_index=False).first().reset_index(drop=True) + ) + logger.info(f"Generated metadata for {sentinel_meta.shape[0]:,} Sentinel item candidates") + + return (sentinel_meta, sample_item_map) + + def select_items( items_meta: pd.DataFrame, + date: Union[str, pd.Timestamp], + config: FeaturesConfig, ) -> List[str]: """Select which pystac items to include for a given sample Args: item_meta (pd.DataFrame): Dataframe with metadata about all possible pystac items to include for the given sample + date (Union[str, pd.Timestamp]): Date the sample was collected + config (FeaturesConfig): Features config Returns: List[str]: List of the pystac items IDs for the selected items """ - # Select least cloudy item - 
least_cloudy = items_meta.sort_values(by="cloud_cover").iloc[0].item_id + # Calculate days between sample and image + items_meta["day_diff"] = (pd.to_datetime(date) - pd.to_datetime(items_meta.datetime)).dt.days + # Filter by time frame + items_meta = items_meta[items_meta.day_diff.between(0, config.pc_days_search_window)].copy() - return [least_cloudy] + # Sort and select + items_meta["day_diff"] = np.abs(items_meta.day_diff) + selected = items_meta.sort_values( + by=["eo:cloud_cover", "day_diff"], ascending=[True, True] + ).head(config.n_sentinel_items) + return selected.item_id.tolist() -def identify_satellite_data( - samples: pd.DataFrame, config: FeaturesConfig, cache_dir -) -> pd.DataFrame: + +def identify_satellite_data(samples: pd.DataFrame, config: FeaturesConfig) -> pd.DataFrame: """Identify all pystac items to be used during feature generation for a given set of samples Args: samples (pd.DataFrame): Dataframe where the index is uid and there are columns for date, longitude, and latitude - config (FeaturesConfig): Featires config + config (FeaturesConfig): Features config Returns: pd.DataFrame: Each row is a unique combination of sample ID and pystac item id. 
The 'selected' column indicates which will be used in feature generation """ - save_dir = Path(cache_dir) / "satellite" - save_dir.mkdir(exist_ok=True, parents=True) - logger.info( - f"Searching {config.pc_collections} within {config.pc_days_search_window} days and {config.pc_meters_search_window} meters" - ) + ## Get all candidate item metadata + candidate_sentinel_meta, sample_item_map = generate_candidate_metadata(samples, config) - satellite_meta = [] - no_results = 0 + ## Select which items to use for each sample + logger.info("Selecting which items to use for feature generation") + selected_satellite_meta = [] for sample in tqdm(samples.itertuples(), total=len(samples)): - # Search planetary computer - search_results = search_planetary_computer( - sample.date, - sample.latitude, - sample.longitude, - collections=config.pc_collections, - days_search_window=config.pc_days_search_window, - meters_search_window=config.pc_meters_search_window, - ) - - # Get satelite metadata - sample_items_meta = get_items_metadata( - search_results, sample.latitude, sample.longitude, config - ) - if len(sample_items_meta) == 0: - no_results += 1 + sample_item_ids = sample_item_map[sample.Index]["sentinel_item_ids"] + if len(sample_item_ids) == 0: continue - # Select items to use for features - selected_ids = select_items(sample_items_meta) - sample_items_meta["selected"] = sample_items_meta.item_id.isin(selected_ids) + sample_items_meta = candidate_sentinel_meta[ + candidate_sentinel_meta.item_id.isin(sample_item_ids) + ].copy() + selected_ids = select_items(sample_items_meta, sample.date, config) + + # Save out the selected items + sample_items_meta = sample_items_meta[sample_items_meta.item_id.isin(selected_ids)] sample_items_meta["sample_id"] = sample.Index - satellite_meta.append(sample_items_meta) - logger.info(f"{no_results} samples did not return any satellite imagery results") + selected_satellite_meta.append(sample_items_meta) + + selected_satellite_meta = 
pd.concat(selected_satellite_meta).reset_index(drop=True) + logger.info( + f"Identified satellite imagery for {selected_satellite_meta.sample_id.nunique():,} samples" + ) - # Concatenate satellite meta for all samples - return pd.concat(satellite_meta) + return selected_satellite_meta def download_satellite_data( - satellite_meta: pd.DataFrame, samples: pd.DataFrame, config: FeaturesConfig, cache_dir + satellite_meta: pd.DataFrame, + samples: pd.DataFrame, + config: FeaturesConfig, + cache_dir: Union[str, Path], ): """Download satellite images as one stacked numpy arrays per pystac item Args: satellite_meta (pd.DataFrame): Dataframe of satellite metadata - indicating which pystac item(s) will be used in feature - generation for each sample + for all pystac items that have been selected for use in + feature generation samples (pd.DataFrame): Dataframe where the index is uid and there are columns for date, longitude, and latitude config (FeaturesConfig): Features config + cache_dir (Union[str, Path]): Cache directory to save raw imagery """ - # Filter to images selected for feature generation - selected = satellite_meta[satellite_meta.selected] - # Iterate over all rows (item / sample combos) logger.info(f"Downloading bands {config.use_sentinel_bands}") - for _, download_row in tqdm(selected.iterrows(), total=len(selected)): - sample_row = samples.loc[download_row.sample_id] - sample_dir = Path(cache_dir) / f"satellite/{download_row.sample_id}" - sample_dir.mkdir(exist_ok=True, parents=True) + no_data_in_bounds_errs = 0 - # Get bounding box for array to save out - (minx, miny, maxx, maxy) = get_bounding_box( - sample_row.latitude, sample_row.longitude, config.image_feature_meter_window - ) - # Iterate over bands and stack - band_arrays = [] - for band in config.use_sentinel_bands: - band_array = ( - rioxarray.open_rasterio(pc.sign(download_row[f"{band}_href"])) - .rio.clip_box( - minx=minx, - miny=miny, - maxx=maxx, - maxy=maxy, - crs="EPSG:4326", - ) - .to_numpy() + 
imagery_dir = Path(cache_dir) / f"sentinel_{config.image_feature_meter_window}" + for _, download_row in tqdm(satellite_meta.iterrows(), total=len(satellite_meta)): + sample_row = samples.loc[download_row.sample_id] + sample_image_dir = imagery_dir / f"{download_row.sample_id}/{download_row.item_id}" + sample_image_dir.mkdir(exist_ok=True, parents=True) + try: + # Get bounding box for array to save out + (minx, miny, maxx, maxy) = get_bounding_box( + sample_row.latitude, sample_row.longitude, config.image_feature_meter_window ) - band_arrays.append(band_array) - stacked_array = np.vstack(band_arrays) - - # Save stacked array - array_save_path = sample_dir / f"{download_row.item_id}.npy" - np.save(array_save_path, stacked_array) + # Iterate over bands and save + for band in config.use_sentinel_bands: + # Check if the file already exists + array_save_path = sample_image_dir / f"{band}.npy" + if not array_save_path.exists(): + # Get unsigned URL so we don't use expired token + unsigned_href = download_row[f"{band}_href"].split("?")[0] + band_array = ( + rioxarray.open_rasterio(pc.sign(unsigned_href)) + .rio.clip_box( + minx=minx, + miny=miny, + maxx=maxx, + maxy=maxy, + crs="EPSG:4326", + ) + .to_numpy() + ) + np.save(array_save_path, band_array) + + except rioxarray.exceptions.NoDataInBounds: + no_data_in_bounds_errs += 1 + # Delete item directory if it has already been created + if sample_image_dir.exists(): + shutil.rmtree(sample_image_dir) + if no_data_in_bounds_errs > 0: + logger.warning( + f"Could not download {no_data_in_bounds_errs:,} image/sample combinations with no data in bounds" + ) diff --git a/cyano/experiment.py b/cyano/experiment.py index 45f6e97f..2b7143c3 100644 --- a/cyano/experiment.py +++ b/cyano/experiment.py @@ -16,6 +16,7 @@ class ExperimentConfig(BaseModel): predict_csv: Path cache_dir: Path = None save_dir: Path = None + debug: bool = False @field_serializer("train_csv", "predict_csv", "cache_dir", "save_dir") def serialize_path_to_str(self, 
x, _info): @@ -27,20 +28,25 @@ def run_experiment(self): model_training_config=self.model_training_config, cache_dir=self.cache_dir, ) - pipeline.run_training(train_csv=self.train_csv, save_path=self.save_dir / "model.zip") + pipeline.run_training( + train_csv=self.train_csv, save_path=self.save_dir / "model.zip", debug=self.debug + ) logger.success(f"Writing out artifact config to {self.save_dir}") with open(f"{self.save_dir}/config_artifact.yaml", "w") as fp: yaml.dump(self.model_dump(), fp) pipeline.run_prediction( - predict_csv=self.predict_csv, preds_path=self.save_dir / "preds.csv" + predict_csv=self.predict_csv, preds_path=self.save_dir / "preds.csv", debug=self.debug ) - EvaluatePreds( - y_true_csv=self.predict_csv, - y_pred_csv=self.save_dir / "preds.csv", - save_dir=self.save_dir / "metrics", - ).calculate_all_and_save() + if self.debug: + logger.info("Evaluation is not run in debug mode") + else: + EvaluatePreds( + y_true_csv=self.predict_csv, + y_pred_csv=self.save_dir / "preds.csv", + save_dir=self.save_dir / "metrics", + ).calculate_all_and_save() - logger.success(f"Wrote out metrics to {self.save_dir}/metrics") + logger.success(f"Wrote out metrics to {self.save_dir}/metrics") diff --git a/cyano/pipeline.py b/cyano/pipeline.py index fb40f9f1..55a7a87a 100644 --- a/cyano/pipeline.py +++ b/cyano/pipeline.py @@ -53,7 +53,7 @@ def _prep_train_data(self, data, debug=False): def _prepare_features(self, samples): ## Identify satellite data - satellite_meta = identify_satellite_data(samples, self.features_config, self.cache_dir) + satellite_meta = identify_satellite_data(samples, self.features_config) save_satellite_to = self.cache_dir / "satellite_metadata_train.csv" satellite_meta.to_csv(save_satellite_to, index=False) logger.info( @@ -148,8 +148,8 @@ def _write_predictions(self, preds_path): self.output_df.to_csv(preds_path, index=True) logger.success(f"Predictions saved to {preds_path}") - def run_prediction(self, predict_csv, preds_path): - 
self._prep_predict_data(predict_csv) + def run_prediction(self, predict_csv, preds_path, debug=False): + self._prep_predict_data(predict_csv, debug) self._prepare_predict_features() self._predict_model() self._write_predictions(preds_path) diff --git a/tests/assets/evaluate_data.csv b/tests/assets/evaluate_data.csv index f1dc0731..7509216a 100644 --- a/tests/assets/evaluate_data.csv +++ b/tests/assets/evaluate_data.csv @@ -1,6 +1,6 @@ -date,latitude,longitude,region,date,density_cells_per_ml,severity -2021-06-29,41.424144,-73.206937,midwest,2018-05-14,585.0,1 -2021-07-25,36.045,-79.0919415955354,west,2016-08-31,5867500.0,4 -2021-08-21,35.8845244661109,-78.9539970867146,south,2020-11-19,290.0,1 -2021-08-28,41.39249,-75.3607,south,2016-08-24,1614.0,1 -2021-07-11,38.3056,-122.026,midwest,2019-07-23,111825.0,3 +latitude,longitude,date,split,region,severity,density +40.090275,-76.873132,2018-05-21,train,northeast,1,0.0 +35.7200811863161,-79.1374207771809,2013-05-22,train,south,2,29046.0 +35.6940254103693,-79.1858165585188,2016-10-18,train,south,1,94.0 +35.68225,-79.07802,2015-03-24,train,south,1,2179.0 +35.79,-79.0264911210803,2018-07-11,train,south,1,11981.0 diff --git a/tests/assets/experiment/model.zip b/tests/assets/experiment/model.zip index 67704be3..9ffa944a 100644 Binary files a/tests/assets/experiment/model.zip and b/tests/assets/experiment/model.zip differ diff --git a/tests/assets/feature_cache/satellite/3a2c48812b551d720f8d56772efa6df1/S2B_MSIL2A_20190918T154919_R054_T18TVL_20201004T144350.npy b/tests/assets/feature_cache/sentinel_500/3a2c48812b551d720f8d56772efa6df1/S2B_MSIL2A_20190918T154919_R054_T18TVL_20201004T144350/B02.npy similarity index 100% rename from tests/assets/feature_cache/satellite/3a2c48812b551d720f8d56772efa6df1/S2B_MSIL2A_20190918T154919_R054_T18TVL_20201004T144350.npy rename to tests/assets/feature_cache/sentinel_500/3a2c48812b551d720f8d56772efa6df1/S2B_MSIL2A_20190918T154919_R054_T18TVL_20201004T144350/B02.npy diff --git 
a/tests/assets/predict_data.csv b/tests/assets/predict_data.csv index 19042959..732a817b 100644 --- a/tests/assets/predict_data.csv +++ b/tests/assets/predict_data.csv @@ -1,6 +1,6 @@ date,latitude,longitude -2021-06-29,41.424144,-73.206937 -2021-07-25,36.045,-79.0919415955354 -2021-08-21,35.8845244661109,-78.9539970867146 -2021-08-28,41.39249,-75.3607 -2021-07-11,38.3056,-122.026 +2021-05-18,35.65,-78.6828160966743 +2018-10-22,37.5643183,-101.3355745 +2021-05-17,36.05,-76.7 +2016-08-31,35.7054164952572,-79.1646588522751 +2015-06-27,41.287577,-80.424543 diff --git a/tests/assets/satellite_meta.csv b/tests/assets/satellite_meta.csv deleted file mode 100644 index 6c6e8239..00000000 --- a/tests/assets/satellite_meta.csv +++ /dev/null @@ -1,3 +0,0 @@ -item_id,datetime,min_long,max_long,min_lat,max_lat,cloud_cover,B02_href,selected,sample_id -S2B_MSIL2A_20170723T155909_R097_T17SPV_20210210T132957,2017-07-23,-79.90214616115195,-78.66878536100633,35.13300633202725,36.13974073092875,42.945348,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/17/S/PV/2017/07/23/S2B_MSIL2A_20170723T155909_N0212_R097_T17SPV_20210210T132957.SAFE/GRANULE/L2A_T17SPV_A001982_20170723T161238/IMG_DATA/R10m/T17SPV_20170723T155909_B02_10m.tif?st=2023-07-25T17%3A31%3A40Z&se=2023-07-26T18%3A16%3A40Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-07-26T13%3A00%3A01Z&ske=2023-08-02T13%3A00%3A01Z&sks=b&skv=2021-06-08&sig=Sm7ZNd3rsECuWRWlIQsrvfDHQAu5gpDfh0OpCEbORfM%3D,True,rszn 
-S2B_MSIL2A_20190918T154919_R054_T18TVL_20201004T144350,2019-09-18,-76.19946,-74.882996,40.556706911840905,41.55135515173343,5.237853,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/18/T/VL/2019/09/18/S2B_MSIL2A_20190918T154919_N0212_R054_T18TVL_20201004T144350.SAFE/GRANULE/L2A_T18TVL_A013236_20190918T160136/IMG_DATA/R10m/T18TVL_20190918T154919_B02_10m.tif?st=2023-07-25T17%3A31%3A40Z&se=2023-07-26T18%3A16%3A40Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-07-26T13%3A00%3A01Z&ske=2023-08-02T13%3A00%3A01Z&sks=b&skv=2021-06-08&sig=Sm7ZNd3rsECuWRWlIQsrvfDHQAu5gpDfh0OpCEbORfM%3D,True,ofhd diff --git a/tests/assets/satellite_metadata.csv b/tests/assets/satellite_metadata.csv new file mode 100644 index 00000000..a47d64e5 --- /dev/null +++ b/tests/assets/satellite_metadata.csv @@ -0,0 +1,3 @@ +item_id,datetime,platform,min_long,max_long,min_lat,max_lat,eo:cloud_cover,B02_href,B03_href,day_diff,selected,sample_id 
+S2B_MSIL2A_20170723T155909_R097_T17SPV_20210210T132957,2017-07-23,Sentinel-2B,-79.90214616115195,-78.66878536100633,35.13300633202725,36.13974073092875,42.945348,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/17/S/PV/2017/07/23/S2B_MSIL2A_20170723T155909_N0212_R097_T17SPV_20210210T132957.SAFE/GRANULE/L2A_T17SPV_A001982_20170723T161238/IMG_DATA/R10m/T17SPV_20170723T155909_B02_10m.tif?st=2023-08-08T20%3A04%3A50Z&se=2023-08-09T20%3A49%3A50Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-08-09T19%3A20%3A48Z&ske=2023-08-16T19%3A20%3A48Z&sks=b&skv=2021-06-08&sig=cROyki1g3qv0aPv4AINV%2Bs2p6xuGUnoAo8q6cTrB7cw%3D,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/17/S/PV/2017/07/23/S2B_MSIL2A_20170723T155909_N0212_R097_T17SPV_20210210T132957.SAFE/GRANULE/L2A_T17SPV_A001982_20170723T161238/IMG_DATA/R10m/T17SPV_20170723T155909_B03_10m.tif?st=2023-08-08T20%3A04%3A50Z&se=2023-08-09T20%3A49%3A50Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-08-09T19%3A20%3A48Z&ske=2023-08-16T19%3A20%3A48Z&sks=b&skv=2021-06-08&sig=cROyki1g3qv0aPv4AINV%2Bs2p6xuGUnoAo8q6cTrB7cw%3D,29,True,9c601f226c2af07d570134127a7fda27 
+S2B_MSIL2A_20190730T154819_R054_T18TVL_20201005T200628,2019-07-30,Sentinel-2B,-76.19946,-74.882965,40.556706911840905,41.55178511688993,5.31211,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/18/T/VL/2019/07/30/S2B_MSIL2A_20190730T154819_N0212_R054_T18TVL_20201005T200628.SAFE/GRANULE/L2A_T18TVL_A012521_20190730T155818/IMG_DATA/R10m/T18TVL_20190730T154819_B02_10m.tif?st=2023-08-08T20%3A04%3A50Z&se=2023-08-09T20%3A49%3A50Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-08-09T19%3A20%3A48Z&ske=2023-08-16T19%3A20%3A48Z&sks=b&skv=2021-06-08&sig=cROyki1g3qv0aPv4AINV%2Bs2p6xuGUnoAo8q6cTrB7cw%3D,https://sentinel2l2a01.blob.core.windows.net/sentinel2-l2/18/T/VL/2019/07/30/S2B_MSIL2A_20190730T154819_N0212_R054_T18TVL_20201005T200628.SAFE/GRANULE/L2A_T18TVL_A012521_20190730T155818/IMG_DATA/R10m/T18TVL_20190730T154819_B03_10m.tif?st=2023-08-08T20%3A04%3A50Z&se=2023-08-09T20%3A49%3A50Z&sp=rl&sv=2021-06-08&sr=c&skoid=c85c15d6-d1ae-42d4-af60-e2ca0f81359b&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2023-08-09T19%3A20%3A48Z&ske=2023-08-16T19%3A20%3A48Z&sks=b&skv=2021-06-08&sig=cROyki1g3qv0aPv4AINV%2Bs2p6xuGUnoAo8q6cTrB7cw%3D,29,True,3a2c48812b551d720f8d56772efa6df1 diff --git a/tests/conftest.py b/tests/conftest.py index 988c371c..34931bfd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -19,6 +19,11 @@ def train_data(train_data_path) -> pd.DataFrame: return pd.read_csv(train_data_path) +@pytest.fixture(scope="session") +def satellite_meta() -> pd.DataFrame: + return pd.read_csv(ASSETS_DIR / "satellite_metadata.csv") + + @pytest.fixture(scope="session") def experiment_config_path() -> Path: return ASSETS_DIR / "experiment_config.yaml" diff --git a/tests/test_features.py b/tests/test_features.py index 7a70cafe..ce740589 100644 --- a/tests/test_features.py +++ b/tests/test_features.py @@ -3,6 +3,7 @@ import numpy as np from cyano.data.features import generate_features +from 
cyano.data.satellite_data import download_satellite_data, generate_candidate_metadata from cyano.data.utils import add_unique_identifier ASSETS_DIR = Path(__file__).parent / "assets" @@ -22,3 +23,55 @@ def test_known_features(train_data, features_config): assert np.isclose(features.loc["3a2c48812b551d720f8d56772efa6df1", "B02_mean"], 161.532712) assert np.isclose(features.loc["3a2c48812b551d720f8d56772efa6df1", "B02_min"], 50) assert np.isclose(features.loc["3a2c48812b551d720f8d56772efa6df1", "B02_max"], 1182) + + +def test_generate_candidate_metadata(train_data, features_config): + train_data = add_unique_identifier(train_data) + + candidate_meta, sample_item_map = generate_candidate_metadata(train_data, features_config) + + # Check that item map has the correct samples and matches known values + assert len(sample_item_map) == len(train_data) + assert set(sample_item_map.keys()) == set(train_data.index) + assert sample_item_map["3a2c48812b551d720f8d56772efa6df1"]["sentinel_item_ids"] == [ + "S2A_MSIL2A_20190824T154911_R054_T18TVL_20201106T052956", + "S2B_MSIL2A_20190819T154819_R054_T18TVL_20201005T022720", + "S2A_MSIL2A_20190814T154911_R054_T18TVL_20201005T001501", + "S2B_MSIL2A_20190809T154819_R054_T18TVL_20201004T222827", + "S2A_MSIL2A_20190804T154911_R054_T18TVL_20201004T201836", + "S2B_MSIL2A_20190730T154819_R054_T18TVL_20201005T200628", + ] + + # Check that candidate metadata matches known expected values + assert candidate_meta.item_id.is_unique + assert len(candidate_meta) == 9 + assert ( + "S2A_MSIL2A_20170728T155901_R097_T17SPV_20210210T154351" in candidate_meta.item_id.values + ) + assert ( + "S2B_MSIL2A_20190819T154819_R054_T18TVL_20201005T022720" in candidate_meta.item_id.values + ) + + +def test_download_satellite_data(tmp_path, satellite_meta, train_data, features_config): + # Download imagery + features_config.use_sentinel_bands = ["B02", "B03"] + train_data = add_unique_identifier(train_data) + download_satellite_data(satellite_meta, train_data, 
features_config, tmp_path) + + # Sentinel image cache directory exists + sentinel_dir = tmp_path / f"sentinel_{features_config.image_feature_meter_window}" + assert sentinel_dir.exists() + assert len(list(sentinel_dir.rglob("*.npy"))) > 0 + + # Check that the structure of saved image arrays is correct + for sample_dir in sentinel_dir.iterdir(): + # Correct number of items per sample + sample_item_dirs = list(sample_dir.iterdir()) + assert len(sample_item_dirs) == features_config.n_sentinel_items + + # Correct bands for each item + for sample_item_dir in sample_item_dirs: + assert set([pth.stem for pth in sample_item_dir.iterdir()]) == set( + features_config.use_sentinel_bands + )