diff --git a/setup.cfg b/setup.cfg index 88842cd9..52b58f75 100644 --- a/setup.cfg +++ b/setup.cfg @@ -40,7 +40,7 @@ install_requires = numpy numba pandas - pyarrow >= 13.0.0 + pyarrow >= 14.0.0 pyyaml >= 5.1 quivr >= 0.7.1 ray[default] diff --git a/thor/__init__.py b/thor/__init__.py index 3c1fef6d..11cc9a9b 100644 --- a/thor/__init__.py +++ b/thor/__init__.py @@ -1,14 +1,13 @@ from .version import __version__ from .config import * -from .constants import * from .orbits import * from .utils import * +from .range_and_transform import * +from .clusters import * from .projections import * from .orbit import * +from .orbit_determination import * from .orbit_selection import * -from .filter_orbits import * from .main import * -from .main_2 import * # TODO: remove when main is replaced -from .analysis import * logger = setupLogger(__name__) diff --git a/thor/analysis.py b/thor/analysis.py deleted file mode 100644 index 4e686bda..00000000 --- a/thor/analysis.py +++ /dev/null @@ -1,290 +0,0 @@ -import os - -import pandas as pd -from difi import analyzeLinkages, analyzeObservations - -__all__ = ["readOrbitDir", "analyzeTHOROrbit", "analyzeTHOR"] - - -def readOrbitDir(orbit_dir): - - projected_observations = pd.read_csv( - os.path.join(orbit_dir, "projected_observations.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - clusters = pd.read_csv(os.path.join(orbit_dir, "clusters.csv"), index_col=False) - - cluster_members = pd.read_csv( - os.path.join(orbit_dir, "cluster_members.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - iod_orbits = Orbits.from_csv( - os.path.join(orbit_dir, "iod_orbits.csv"), - ).to_df(include_units=False) - - iod_orbit_members = pd.read_csv( - os.path.join(orbit_dir, "iod_orbit_members.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - od_orbits = Orbits.from_csv( - os.path.join(orbit_dir, "od_orbits.csv"), - ).to_df(include_units=False) - - od_orbit_members = pd.read_csv( - os.path.join(orbit_dir, "od_orbit_members.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - recovered_orbits = Orbits.from_csv( - os.path.join(orbit_dir, "recovered_orbits.csv"), - ).to_df(include_units=False) - - recovered_orbit_members = pd.read_csv( - os.path.join(orbit_dir, "recovered_orbit_members.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - data_products = ( - projected_observations, - clusters, - cluster_members, - iod_orbits, - iod_orbit_members, - od_orbits, - od_orbit_members, - recovered_orbits, - recovered_orbit_members, - ) - return data_products - - -def analyzeTHOROrbit( - preprocessed_associations, - orbit_dir, - classes=None, - min_obs=5, - contamination_percentage=20, - metric="min_obs", - metric_kwargs={ - "min_obs": 5, - }, -): - - data_products = readOrbitDir(orbit_dir) - ( - projected_observations, - clusters, - cluster_members, - iod_orbits, - iod_orbit_members, - od_orbits, - od_orbit_members, - recovered_orbits, - recovered_orbit_members, - ) = data_products - - analysis_observations = projected_observations.merge( - preprocessed_associations, on="obs_id", how="left" - ) - - column_mapping = {"obs_id": "obs_id", "linkage_id": "cluster_id", "truth": "obj_id"} - - all_truths, findable_observations, summary = analyzeObservations( - analysis_observations, - classes=classes, - metric=metric, - **metric_kwargs, - column_mapping=column_mapping, - ) - - all_clusters, all_truths_clusters, summary_clusters = analyzeLinkages( - analysis_observations, - cluster_members, - all_truths=all_truths, - min_obs=min_obs, - 
contamination_percentage=contamination_percentage, - classes=classes, - column_mapping=column_mapping, - ) - for df in [all_clusters, all_truths_clusters, summary_clusters]: - df.insert(0, "component", "clustering") - - column_mapping["linkage_id"] = "orbit_id" - - all_iod_orbits, all_truths_iod, summary_iod = analyzeLinkages( - analysis_observations, - iod_orbit_members, - all_truths=all_truths, - min_obs=min_obs, - contamination_percentage=contamination_percentage, - classes=classes, - column_mapping=column_mapping, - ) - for df in [all_iod_orbits, all_truths_iod, summary_iod]: - df.insert(0, "component", "iod") - - all_od_orbits, all_truths_od, summary_od = analyzeLinkages( - analysis_observations, - od_orbit_members, - all_truths=all_truths, - min_obs=min_obs, - contamination_percentage=0.0, - classes=classes, - column_mapping=column_mapping, - ) - for df in [all_od_orbits, all_truths_od, summary_od]: - df.insert(0, "component", "od") - - all_recovered_orbits, all_truths_recovered, summary_recovered = analyzeLinkages( - analysis_observations, - recovered_orbit_members, - all_truths=all_truths, - min_obs=min_obs, - contamination_percentage=0.0, - classes=classes, - column_mapping=column_mapping, - ) - for df in [all_recovered_orbits, all_truths_recovered, summary_recovered]: - df.insert(0, "component", "od+a") - - summary = pd.concat([summary_clusters, summary_iod, summary_od, summary_recovered]) - summary.reset_index(inplace=True, drop=True) - - all_truths = pd.concat( - [all_truths_clusters, all_truths_iod, all_truths_od, all_truths_recovered] - ) - all_truths.reset_index(inplace=True, drop=True) - - all_linkages = pd.concat( - [all_clusters, all_iod_orbits, all_od_orbits, all_recovered_orbits] - ) - all_linkages.reset_index(inplace=True, drop=True) - - all_linkages = all_linkages[ - [ - "component", - "cluster_id", - "orbit_id", - "num_obs", - "num_members", - "pure", - "pure_complete", - "partial", - "mixed", - "contamination_percentage", - "found_pure", - "found_partial", - "found", - "linked_truth", - ] - ] - all_linkages.loc[all_linkages["cluster_id"].isna(), "cluster_id"] = "None" - all_linkages.loc[all_linkages["orbit_id"].isna(), "orbit_id"] = "None" - - summary["linkage_purity"] = 100 * (summary["pure_linkages"] / summary["linkages"]) - - return all_linkages, all_truths, summary - - -def analyzeTHOR( - preprocessed_associations, - out_dir, - min_obs=5, - contamination_percentage=20, - classes=None, - metric="min_obs", - metric_kwargs={"min_obs": 5}, -): - - # Read preprocessed observations from out dir - preprocessed_observations = pd.read_csv( - os.path.join(out_dir, "preprocessed_observations.csv"), - index_col=False, - dtype={"obs_id": str}, - ) - - # Merge with prepprocessed associations to create a set of 'analysis observations': - # observations that contain any previously known labels - analysis_observations = preprocessed_observations.merge( - preprocessed_associations, on="obs_id" - ) - - # Calculate which objects should be findable - column_mapping = {"obs_id": "obs_id", "truth": "obj_id", "linkage_id": "orbit_id"} - all_truths, findable_observations, summary = analyzeObservations( - analysis_observations, - classes=classes, - metric="min_obs", - column_mapping=column_mapping, - **metric_kwargs, - ) - - # Read the recovered orbits and orbit members - recovered_orbits = Orbits.from_csv(os.path.join(out_dir, "recovered_orbits.csv")) - - recovered_orbit_members = pd.read_csv( - os.path.join(out_dir, "recovered_orbit_members.csv"), - index_col=False, - dtype={"obs_id": 
str}, - ) - - # Calculate which objects were actually recovered - all_recovered_orbits, all_recovered_truths, recovered_summary = analyzeLinkages( - analysis_observations, - recovered_orbit_members, - all_truths=all_truths, - contamination_percentage=0.0, # Recovered orbits should be contamination free (no partial orbits) - classes=classes, - column_mapping=column_mapping, - ) - - # Read test_orbits file from out_dir - test_orbits_file = os.path.join(out_dir, "test_orbits_out.csv") - if not os.path.exists(test_orbits_file): - raise ValueError("Cannot find test_orbits_out.csv.") - else: - test_orbits = Orbits.from_csv(test_orbits_file) - - test_orbits_df = test_orbits.to_df(include_units=False) - - all_linkages_dfs = [] - all_truths_dfs = [] - summary_dfs = [] - - # Go through each test orbit directory and analyze the outputs - for orbit_id in test_orbits_df["test_orbit_id"].unique(): - - orbit_dir = os.path.join(out_dir, "orbit_{}".format(orbit_id)) - - all_linkages, all_truths, summary = analyzeTHOROrbit( - preprocessed_associations, - orbit_dir, - classes=classes, - min_obs=min_obs, - contamination_percentage=contamination_percentage, - metric="min_obs", - metric_kwargs=metric_kwargs, - ) - - for df in [all_linkages, all_truths, summary]: - df.insert(0, "test_orbit_id", orbit_id) - - all_linkages_dfs.append(all_linkages) - all_truths_dfs.append(all_truths) - summary_dfs.append(summary) - - all_linkages = pd.concat(all_linkages_dfs, ignore_index=True) - all_truths = pd.concat(all_truths_dfs, ignore_index=True) - summary = pd.concat(summary_dfs, ignore_index=True) - - test_orbit_analysis = (all_linkages, all_truths, summary) - run_analysis = (all_recovered_orbits, all_recovered_truths, recovered_summary) - return run_analysis, test_orbit_analysis diff --git a/thor/clusters.py b/thor/clusters.py index 546f6ea7..ef4c3d38 100644 --- a/thor/clusters.py +++ b/thor/clusters.py @@ -1,5 +1,18 @@ +import logging +import time +import uuid +from typing import List, Literal, Optional, Tuple + import numba import numpy as np +import numpy.typing as npt +import pyarrow as pa +import pyarrow.compute as pc +import quivr as qv +import ray +from adam_core.propagator import _iterate_chunks + +from .range_and_transform import TransformedDetections # Disable GPU until the GPU-accelerated clustering codes # are better tested and implemented @@ -12,6 +25,74 @@ from sklearn.cluster import DBSCAN +__all__ = [ + "cluster_and_link", + "Clusters", + "ClusterMembers", +] + +logger = logging.getLogger("thor") + + +class Clusters(qv.Table): + cluster_id = qv.StringColumn(default=lambda: uuid.uuid4().hex) + vtheta_x = qv.Float64Column() + vtheta_y = qv.Float64Column() + arc_length = qv.Float64Column() + num_obs = qv.Int64Column() + + def drop_duplicates( + self, cluster_members: "ClusterMembers" + ) -> Tuple["Clusters", "ClusterMembers"]: + """ + Drop clusters that have identical sets of observation IDs. + + Parameters + ---------- + cluster_members: `~thor.clusters.ClusterMembers` + A table of cluster members. + + Returns + ------- + `~thor.clusters.Clusters` + A table of clusters with duplicate clusters removed. 
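+        `~thor.clusters.ClusterMembers`
+            A table of cluster members, filtered to the clusters that were kept.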
+ """ + + # Sort by cluster_id and obs_id + sorted = self.sort_by(["cluster_id"]) + cluster_members = cluster_members.sort_by(["cluster_id", "obs_id"]) + + grouped_by_cluster_id = cluster_members.table.group_by( + ["cluster_id"] + ).aggregate([("obs_id", "distinct")]) + grouped_by_cluster_id = grouped_by_cluster_id.append_column( + "index", pa.array(np.arange(0, len(sorted))) + ) + + # We revert to pandas here because grouping by a list of observation IDs with + # pyarrow functions fails at the table creation stage during aggregation. + # This is likely a missing feature in pyarrow. The following code doesn't work: + # grouped_by_obs_lists = grouped_by_cluster_id.group_by( + # ["obs_id_distinct"], + # use_threads=False + # ).aggregate([("index", "first") + + df = grouped_by_cluster_id.to_pandas() + df["obs_id_distinct"] = df["obs_id_distinct"].apply(lambda x: x.tolist()) + indices = df.drop_duplicates(subset=["obs_id_distinct"])["index"].values + + filtered = sorted.take(indices) + filtered_cluster_members = cluster_members.apply_mask( + pc.is_in(cluster_members.cluster_id, filtered.cluster_id) + ) + return filtered, filtered_cluster_members + + +class ClusterMembers(qv.Table): + cluster_id = qv.StringColumn() + obs_id = qv.StringColumn() + + def find_clusters(points, eps, min_samples, alg="hotspot_2d"): """ Find all clusters in a 2-dimensional array of datapoints. @@ -85,6 +166,7 @@ def filter_clusters_by_length(clusters, dt, min_samples, min_arc_length): The original clusters list, filtered down. """ filtered_clusters = [] + arc_lengths = [] for cluster in clusters: dt_in_cluster = dt[cluster] num_obs = len(dt_in_cluster) @@ -95,7 +177,9 @@ def filter_clusters_by_length(clusters, dt, min_samples, min_arc_length): and (arc_length >= min_arc_length) ): filtered_clusters.append(cluster) - return filtered_clusters + arc_lengths.append(arc_length) + + return filtered_clusters, arc_lengths def _find_clusters_hotspots_2d(points, eps, min_samples): @@ -356,3 +440,335 @@ def _find_clusters_dbscan(points, eps, min_samples): clusters.append(cluster_indices) del db return clusters + + +def cluster_velocity( + obs_ids: npt.ArrayLike, + x: npt.NDArray[np.float64], + y: npt.NDArray[np.float64], + dt: npt.NDArray[np.float64], + vx: float, + vy: float, + radius: float = 0.005, + min_obs: int = 5, + min_arc_length: float = 1.0, + alg: Literal["hotspot_2d", "dbscan"] = "dbscan", +) -> Tuple[Clusters, ClusterMembers]: + """ + Clusters THOR projection with different velocities + in the projection plane using `~scipy.cluster.DBSCAN`. + Parameters + ---------- + obs_ids : `~numpy.ndarray' (N) + Observation IDs. + x : `~numpy.ndarray' (N) + Projection space x coordinate in degrees or radians. + y : `~numpy.ndarray' (N) + Projection space y coordinate in degrees or radians. + dt : `~numpy.ndarray' (N) + Change in time from 0th exposure in units of MJD. + vx : float + Projection space x velocity in units of degrees or radians per day in MJD. + vy : float + Projection space y velocity in units of degrees or radians per day in MJD. + radius : float, optional + The maximum distance between two samples for them to be considered + as in the same neighborhood. + See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html + [Default = 0.005] + min_obs : int, optional + The number of samples (or total weight) in a neighborhood for a + point to be considered as a core point. This includes the point itself. 
+ See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html + [Default = 5] + min_arc_length : float, optional + Minimum arc length in units of days for a cluster to be accepted. + + Returns + ------- + list + If clusters are found, will return a list of numpy arrays containing the + observation IDs for each cluster. If no clusters are found, will return np.NaN. + """ + logger.debug(f"cluster: vx={vx} vy={vy} n_obs={len(obs_ids)}") + xx = x - vx * dt + yy = y - vy * dt + + X = np.stack((xx, yy), 1) + + clusters = find_clusters(X, radius, min_obs, alg=alg) + clusters, arc_lengths = filter_clusters_by_length( + clusters, + dt, + min_obs, + min_arc_length, + ) + + if len(clusters) == 0: + return Clusters.empty(), ClusterMembers.empty() + else: + + cluster_ids = [] + cluster_num_obs = [] + cluster_members_cluster_ids = [] + cluster_members_obs_ids = [] + for cluster in clusters: + id = uuid.uuid4().hex + obs_ids_i = obs_ids[cluster] + num_obs = len(obs_ids_i) + + cluster_ids.append(id) + cluster_num_obs.append(num_obs) + cluster_members_cluster_ids.append(np.full(num_obs, id)) + cluster_members_obs_ids.append(obs_ids_i) + + clusters = Clusters.from_kwargs( + cluster_id=cluster_ids, + vtheta_x=np.full(len(cluster_ids), vx), + vtheta_y=np.full(len(cluster_ids), vy), + arc_length=arc_lengths, + num_obs=cluster_num_obs, + ) + + cluster_members = ClusterMembers.from_kwargs( + cluster_id=np.concatenate(cluster_members_cluster_ids), + obs_id=np.concatenate(cluster_members_obs_ids), + ) + + return clusters, cluster_members + + +def cluster_velocity_worker( + vx: npt.NDArray[np.float64], + vy: npt.NDArray[np.float64], + obs_ids: npt.ArrayLike, + x: npt.NDArray[np.float64], + y: npt.NDArray[np.float64], + dt: npt.NDArray[np.float64], + radius: float = 0.005, + min_obs: int = 5, + min_arc_length: float = 1.0, + alg: Literal["hotspot_2d", "dbscan"] = "dbscan", +) -> Tuple[Clusters, ClusterMembers]: + """ + Helper function for parallelizing cluster_velocity. This function takes a + batch or chunk of velocities and returns the clusters and cluster members + for that batch. + + """ + clusters_list = [] + cluster_members_list = [] + for vx_i, vy_i in zip(vx, vy): + clusters_i, cluster_members_i = cluster_velocity( + obs_ids, + x, + y, + dt, + vx_i, + vy_i, + radius=radius, + min_obs=min_obs, + min_arc_length=min_arc_length, + alg=alg, + ) + clusters_list.append(clusters_i) + cluster_members_list.append(cluster_members_i) + + return qv.concatenate(clusters_list), qv.concatenate(cluster_members_list) + + +cluster_velocity_remote = ray.remote(cluster_velocity_worker) +cluster_velocity_remote.options( + num_returns=1, + num_cpus=1, +) + + +def cluster_and_link( + observations: TransformedDetections, + vx_range: List[float] = [-0.1, 0.1], + vy_range: List[float] = [-0.1, 0.1], + vx_bins: int = 100, + vy_bins: int = 100, + radius: float = 0.005, + min_obs: int = 5, + min_arc_length: float = 1.0, + alg: Literal["hotspot_2d", "dbscan"] = "dbscan", + chunk_size: int = 1000, + max_processes: Optional[int] = 1, +) -> Tuple[Clusters, ClusterMembers]: + """ + Cluster and link correctly projected (after ranging and shifting) + detections. + + Parameters + ---------- + observations : `~pandas.DataFrame` + DataFrame containing post-range and shift observations. + vx_range : {None, list or `~numpy.ndarray` (2)} + Maximum and minimum velocity range in x. + [Default = [-0.1, 0.1]] + vy_range : {None, list or `~numpy.ndarray` (2)} + Maximum and minimum velocity range in y. 
+ [Default = [-0.1, 0.1]] + vx_bins : int, optional + Length of x-velocity grid between vx_range[0] + and vx_range[-1]. + [Default = 100] + vy_bins: int, optional + Length of y-velocity grid between vy_range[0] + and vy_range[-1]. + [Default = 100] + radius : float, optional + The maximum distance between two samples for them to be considered + as in the same neighborhood. + See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html + [Default = 0.005] + min_obs : int, optional + The number of samples (or total weight) in a neighborhood for a + point to be considered as a core point. This includes the point itself. + See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html + [Default = 5] + alg: str + Algorithm to use. Can be "dbscan" or "hotspot_2d". + num_jobs : int, optional + Number of jobs to launch. + parallel_backend : str, optional + Which parallelization backend to use {'ray', 'mp', 'cf'}. + Defaults to using Python's concurrent futures module ('cf'). + + Returns + ------- + clusters : `~pandas.DataFrame` + DataFrame with the cluster ID, the number of observations, and the x and y velocity. + cluster_members : `~pandas.DataFrame` + DataFrame containing the cluster ID and the observation IDs of its members. + + Notes + ----- + The algorithm chosen can have a big impact on performance and accuracy. + + alg="dbscan" uses the DBSCAN algorithm of Ester et. al. It's relatively slow + but works with high accuracy; it is certain to find all clusters with at + least min_obs points that are separated by at most radius. + + alg="hotspot_2d" is much faster (perhaps 10-20x faster) than dbscan, but it + may miss some clusters, particularly when points are spaced a distance of 'radius' + apart. + """ + time_start_cluster = time.time() + logger.info("Running velocity space clustering...") + + vx = np.linspace(*vx_range, num=vx_bins) + vy = np.linspace(*vy_range, num=vy_bins) + vxx, vyy = np.meshgrid(vx, vy) + vxx = vxx.flatten() + vyy = vyy.flatten() + + logger.debug("X velocity range: {}".format(vx_range)) + logger.debug("X velocity bins: {}".format(vx_bins)) + logger.debug("Y velocity range: {}".format(vy_range)) + logger.debug("Y velocity bins: {}".format(vy_bins)) + logger.debug("Velocity grid size: {}".format(vx_bins)) + logger.info("Max sample distance: {}".format(radius)) + logger.info("Minimum samples: {}".format(min_obs)) + + clusters_list = [] + cluster_members_list = [] + if len(observations) > 0: + # Extract useful quantities + obs_ids = observations.id.to_numpy(zero_copy_only=False) + theta_x = observations.coordinates.theta_x.to_numpy(zero_copy_only=False) + theta_y = observations.coordinates.theta_y.to_numpy(zero_copy_only=False) + mjd = observations.coordinates.time.mjd().to_numpy(zero_copy_only=False) + + # Select detections in first exposure + first = np.where(mjd == mjd.min())[0] + mjd0 = mjd[first][0] + dt = mjd - mjd0 + + if max_processes is None or max_processes > 1: + + if not ray.is_initialized(): + ray.init(address="auto") + + # Put all arrays (which can be large) in ray's + # local object store ahead of time + obs_ids_oid = ray.put(obs_ids) + theta_x_oid = ray.put(theta_x) + theta_y_oid = ray.put(theta_y) + dt_oid = ray.put(dt) + + futures = [] + for vxi_chunk, vyi_chunk in zip( + _iterate_chunks(vxx, chunk_size), _iterate_chunks(vyy, chunk_size) + ): + futures.append( + cluster_velocity_remote.remote( + vxi_chunk, + vyi_chunk, + obs_ids_oid, + theta_x_oid, + theta_y_oid, + dt_oid, + radius=radius, + min_obs=min_obs, + 
min_arc_length=min_arc_length, + alg=alg, + ) + ) + + while futures: + finished, futures = ray.wait(futures, num_returns=1) + result = ray.get(finished[0]) + clusters_list.append(result[0]) + cluster_members_list.append(result[1]) + + else: + + for vxi_chunk, vyi_chunk in zip( + _iterate_chunks(vxx, chunk_size), _iterate_chunks(vyy, chunk_size) + ): + clusters_i, cluster_members_i = cluster_velocity_worker( + vxi_chunk, + vyi_chunk, + obs_ids, + theta_x, + theta_y, + dt, + radius=radius, + min_obs=min_obs, + min_arc_length=min_arc_length, + alg=alg, + ) + clusters_list.append(clusters_i) + cluster_members_list.append(cluster_members_i) + + clusters = qv.concatenate(clusters_list) + cluster_members = qv.concatenate(cluster_members_list) + + # Drop duplicate clusters + time_start_drop = time.time() + logger.info("Removing duplicate clusters...") + num_clusters = len(clusters) + clusters, cluster_members = clusters.drop_duplicates(cluster_members) + logger.info(f"Removed {num_clusters - len(clusters)} duplicate clusters.") + time_end_drop = time.time() + logger.info( + f"Cluster deduplication completed in {time_end_drop - time_start_drop:.3f} seconds." + ) + + else: + + clusters = Clusters.empty() + cluster_members = ClusterMembers.empty() + + time_end_cluster = time.time() + logger.info("Found {} clusters.".format(len(clusters))) + logger.info( + "Clustering completed in {:.3f} seconds.".format( + time_end_cluster - time_start_cluster + ) + ) + + return clusters, cluster_members diff --git a/thor/config.py b/thor/config.py index 27b783fa..6524bc16 100644 --- a/thor/config.py +++ b/thor/config.py @@ -21,6 +21,7 @@ class Config: cluster_min_obs: int = 6 cluster_min_arc_length: float = 1.0 cluster_algorithm: Literal["hotspot_2d", "dbscan"] = "dbscan" + cluster_chunk_size: int = 1000 iod_min_obs: int = 6 iod_min_arc_length: float = 1.0 iod_contamination_percentage: float = 20.0 diff --git a/thor/constants.py b/thor/constants.py deleted file mode 100644 index ef6e107a..00000000 --- a/thor/constants.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np - -__all__ = [ - "KM_P_AU", - "S_P_DAY", - "Constants", - "DE43X", - "DE44X", -] - -# km in an au -KM_P_AU = 149597870.700 -# seconds in a day -S_P_DAY = 86400.0 - - -class _Constants: - def __init__(self, C=None, MU=None, R_Earth=None, Obliquity=None): - self.C = C - self.MU = MU - self.R_EARTH = R_Earth - self.OBLIQUITY = Obliquity - - # Transformation matrix from Equatorial J2000 to Ecliptic J2000 - self.TRANSFORM_EQ2EC = np.array( - [ - [1, 0, 0], - [0, np.cos(self.OBLIQUITY), np.sin(self.OBLIQUITY)], - [0, -np.sin(self.OBLIQUITY), np.cos(self.OBLIQUITY)], - ] - ) - - # Transformation matrix from Ecliptic J2000 to Equatorial J2000 - self.TRANSFORM_EC2EQ = self.TRANSFORM_EQ2EC.T - return - - -DE43X_CONSTANTS = { - # Speed of Light : au / d (299792.458 km / s -- DE430/DE431) - "C": 299792.458 / KM_P_AU * S_P_DAY, - # Standard Gravitational Parameter -- Sun : au**3 / d**2 (0.295912208285591100E-3 -- DE431/DE430) - "MU": 0.295912208285591100e-3, - # Earth Equatorial Radius: au (6378.1363 km -- DE431/DE430) - "R_Earth": 6378.1363 / KM_P_AU, - # Mean Obliquity at J2000: radians (84381.448 arcseconds -- DE431/DE430) - "Obliquity": 84381.448 * np.pi / (180.0 * 3600.0), -} -Constants = DE43X = _Constants(**DE43X_CONSTANTS) - -DE44X_CONSTANTS = { - # Speed of Light : au / d (299792.458 km / s -- DE430/DE431) - "C": 299792.458 / KM_P_AU * S_P_DAY, - # Standard Gravitational Parameter -- Sun : au**3 / d**2 (0.29591220828411956E-03 -- DE441/DE440) - "MU": 
0.29591220828411956e-03, - # Earth Equatorial Radius: au (6378.1363 km -- DE431/DE430) - "R_Earth": 6378.1363 / KM_P_AU, - # Mean Obliquity at J2000: radians (84381.448 arcseconds -- DE431/DE430) - "Obliquity": 84381.448 * np.pi / (180.0 * 3600.0), -} -DE44X = _Constants(**DE44X_CONSTANTS) diff --git a/thor/filter_orbits.py b/thor/filter_orbits.py deleted file mode 100644 index 16f453db..00000000 --- a/thor/filter_orbits.py +++ /dev/null @@ -1,188 +0,0 @@ -from typing import Tuple - -import pandas as pd - -from .utils import calcDeltas - -__all__ = ["filterKnownOrbits", "filterOrbits"] - - -def filterKnownOrbits( - orbits: pd.DataFrame, - orbit_observations: pd.DataFrame, - associations: pd.DataFrame, - min_obs: int = 5, -) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Remove all observations of unknown objects, keeping only observations of objects with - a known association. If any orbits have fewer than min_obs observations after removing - unknown observations then remove those orbits as well. - - This function will also set the provisional and permanent designation columns as required - by the ADES file format. - - Parameters - ---------- - orbits : `~pandas.DataFrame` - DataFrame of orbits. - orbit_observations : `~pandas.DataFrame` - Dataframe of orbit observations with at least one column with the orbit ID ('orbit_id') and - one column with the 'obs_id' - associations : `~pandas.DataFrame` - DataFrame of known associations, with one column of containing the observation ID ('obs_id') - and another column containing the association ('obj_id'). Any unknown objects should have - been assigned an unknown ID. See preprocessObservations. - min_obs : int - The minimum number of observations for an object to be considered as recovered. - - Returns - ------- - known_orbits : `~pandas.DataFrame` - Orbits of previously known objects. - known_orbit_observations : `~pandas.DataFrame` - Observations of previously known objects, the constituent observations - to which the orbits were fit. 
- """ - # Merge associations with orbit observations - labeled_observations = orbit_observations.merge( - associations[["obs_id", "obj_id"]], on="obs_id", how="left" - ) - - # Keep only observations of known objects - labeled_observations = labeled_observations[ - ~labeled_observations["obj_id"].str.contains(UNKNOWN_ID_REGEX, regex=True) - ] - - # Keep only known objects with at least min_obs observations - occurences = labeled_observations["orbit_id"].value_counts() - orbit_ids = occurences.index.values[occurences.values >= min_obs] - - # Filter input orbits - orbits_mask = orbits["orbit_id"].isin(orbit_ids) - orbit_observations_mask = labeled_observations["orbit_id"].isin(orbit_ids) - known_orbits = orbits[orbits_mask].copy() - known_orbit_observations = labeled_observations[orbit_observations_mask].copy() - - # Split into permanent and provisional designations - if len(known_orbit_observations) > 0: - known_orbit_observations.loc[:, "permID"] = "" - known_orbit_observations.loc[:, "provID"] = "" - else: - known_orbit_observations["permID"] = "" - known_orbit_observations["provID"] = "" - - # Process permanent IDs first - # TODO : add an equivalent for Comets - perm_ids = known_orbit_observations["obj_id"].str.isnumeric() - known_orbit_observations.loc[perm_ids, "permID"] = known_orbit_observations[ - perm_ids - ]["obj_id"].values - - # Identify provisional IDs next - prov_ids = (~known_orbit_observations["obj_id"].str.isnumeric()) & ( - ~known_orbit_observations["obj_id"].str.contains(UNKNOWN_ID_REGEX, regex=True) - ) - known_orbit_observations.loc[prov_ids, "provID"] = known_orbit_observations[ - prov_ids - ]["obj_id"].values - - # Reorder the columns to put the labels toward the front - cols = known_orbit_observations.columns - first = ["orbit_id", "permID", "provID", "obj_id", "obs_id"] - cols_ordered = first + cols[~cols.isin(first)].tolist() - known_orbit_observations = known_orbit_observations[cols_ordered] - - return known_orbits, known_orbit_observations - - -def filterOrbits( - orbits: pd.DataFrame, - orbit_observations: pd.DataFrame, - associations: pd.DataFrame, - min_obs: int = 5, - min_time_separation: float = 30.0, - delta_cols: list = ["mjd_utc", "mag", "RA_deg", "Dec_deg"], -) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Filter orbits into orbits of previously known objects and potential discovery candidates. - - Parameters - ---------- - orbits : `~pandas.DataFrame` - DataFrame of orbits. - orbit_observations : `~pandas.DataFrame` - Dataframe of orbit observations with at least one column with the orbit ID ('orbit_id') and - one column with the 'obs_id' - associations : `~pandas.DataFrame` - DataFrame of known associations, with one column of containing the observation ID ('obs_id') - and another column containing the association ('obj_id'). Any unknown objects should have - been assigned an unknown ID. See preprocessObservations. - min_obs : int - The minimum number of observations for an object to be considered as recovered. - min_time_separation : int - The minimum time two observations should be separated in minutes. If any observations - for a single orbit are seperated by less than this amount then only the first observation is kept. - This is useful to prevent stationary sources from biasing orbit fits, although may decrease overall - completeness. - delta_cols : list[str] - Columns for which to calculate deltas (must include mjd_utc). 
- - Returns - ------- - discovery_candidates : (`~pandas.DataFrame`, `~pandas.DataFrame`) - DataFrame of dicovery candidate orbits and discovery candidate observations. - known_orbits : (`~pandas.DataFrame`, `~pandas.DataFrame`) - DataFrame of known orbits and known orbit observations. - """ - # Calculate deltas of a variety of quantities (this returns the orbit_observations dataframe - # with the delta columns added) - deltas = calcDeltas( - orbit_observations, groupby_cols=["orbit_id", "night_id"], delta_cols=delta_cols - ) - - # Mark all observations within min_time of another as filtered - deltas.loc[:, "filtered"] = 1 - deltas.loc[ - (deltas["dmjd_utc"].isna()) - | (deltas["dmjd_utc"] >= min_time_separation / 60 / 24), - "filtered", - ] = 0 - orbits_ = orbits.copy() - orbit_observations_ = deltas.copy() - - # Identify known orbits (also remove any observations of unknown objects from these orbits) - recovered_known_orbits, recovered_known_orbit_observations = filterKnownOrbits( - orbits_, orbit_observations_, associations, min_obs=min_obs - ) - - # Remove the known orbits from the pool of orbits - # The remaining orbits are potential candidates - known_orbit_ids = recovered_known_orbits["orbit_id"].values - candidate_orbits = orbits_[~orbits_["orbit_id"].isin(known_orbit_ids)] - candidate_orbit_observations = orbit_observations_[ - ~orbit_observations_["orbit_id"].isin(known_orbit_ids) - ] - - # Remove any observations of the candidate discoveries that are potentially - # too close in time to eachother (removes stationary source that could bias results) - # Any orbits that now have fewer than min_obs observations are also removed - candidate_orbit_observations = candidate_orbit_observations[ - candidate_orbit_observations["filtered"] == 0 - ] - occurences = candidate_orbit_observations["orbit_id"].value_counts() - orbit_ids = occurences.index.values[occurences.values >= min_obs] - candidate_orbits = orbits[orbits["orbit_id"].isin(orbit_ids)] - candidate_orbit_observations = candidate_orbit_observations[ - candidate_orbit_observations["orbit_id"].isin(orbit_ids) - ] - - # Add a trkSub column to the discovery candidates - trk_subs = [ - f"t{i[0:4]}{i[-3:]}" for i in candidate_orbit_observations["orbit_id"].values - ] - candidate_orbit_observations.insert(1, "trkSub", trk_subs) - - discovery_candidates = (candidate_orbits, candidate_orbit_observations) - known_orbits = (recovered_known_orbits, recovered_known_orbit_observations) - - return discovery_candidates, known_orbits diff --git a/thor/main.py b/thor/main.py index 31e351b7..1083cac8 100644 --- a/thor/main.py +++ b/thor/main.py @@ -1,443 +1,193 @@ -import os - -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["VECLIB_MAXIMUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" - -import concurrent.futures as cf import logging -import multiprocessing as mp import time -import uuid -from functools import partial - -import numpy as np -import pandas as pd - -from .clusters import filter_clusters_by_length, find_clusters -from .utils import _checkParallel, _initWorker +from typing import Any, Iterator, List, Optional + +import quivr as qv +import ray +from adam_core.propagator import PYOORB + +from .clusters import cluster_and_link +from .config import Config +from .observations.filters import ObservationFilter, TestOrbitRadiusObservationFilter +from .observations.observations import Observations +from .orbit import TestOrbit +from .orbits import ( + 
differential_correction, + initial_orbit_determination, + merge_and_extend_orbits, +) +from .range_and_transform import range_and_transform logger = logging.getLogger("thor") -__all__ = [ - "clusterVelocity", - "clusterVelocity_worker", - "clusterAndLink", -] - - - -def clusterVelocity( - obs_ids, - x, - y, - dt, - vx, - vy, - eps=0.005, - min_obs=5, - min_arc_length=1.0, - alg="hotspot_2d", -): - """ - Clusters THOR projection with different velocities - in the projection plane using `~scipy.cluster.DBSCAN`. - Parameters - ---------- - obs_ids : `~numpy.ndarray' (N) - Observation IDs. - x : `~numpy.ndarray' (N) - Projection space x coordinate in degrees or radians. - y : `~numpy.ndarray' (N) - Projection space y coordinate in degrees or radians. - dt : `~numpy.ndarray' (N) - Change in time from 0th exposure in units of MJD. - vx : `~numpy.ndarray' (N) - Projection space x velocity in units of degrees or radians per day in MJD. - vy : `~numpy.ndarray' (N) - Projection space y velocity in units of degrees or radians per day in MJD. - eps : float, optional - The maximum distance between two samples for them to be considered - as in the same neighborhood. - See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html - [Default = 0.005] - min_obs : int, optional - The number of samples (or total weight) in a neighborhood for a - point to be considered as a core point. This includes the point itself. - See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html - [Default = 5] - min_arc_length : float, optional - Minimum arc length in units of days for a cluster to be accepted. - - Returns - ------- - list - If clusters are found, will return a list of numpy arrays containing the - observation IDs for each cluster. If no clusters are found, will return np.NaN. - """ - logger.debug(f"cluster: vx={vx} vy={vy} n_obs={len(obs_ids)}") - xx = x - vx * dt - yy = y - vy * dt - - X = np.stack((xx, yy), 1) - - clusters = find_clusters(X, eps, min_obs, alg=alg) - clusters = filter_clusters_by_length( - clusters, - dt, - min_obs, - min_arc_length, - ) - - cluster_ids = [] - for cluster in clusters: - cluster_ids.append(obs_ids[cluster]) - - if len(cluster_ids) == 0: - cluster_ids = np.NaN - return cluster_ids - - -def clusterVelocity_worker( - vx, - vy, - obs_ids=None, - x=None, - y=None, - dt=None, - eps=None, - min_obs=None, - min_arc_length=None, - alg=None, -): - """ - Helper function to multiprocess clustering. - - """ - cluster_ids = clusterVelocity( - obs_ids, - x, - y, - dt, - vx, - vy, - eps=eps, - min_obs=min_obs, - min_arc_length=min_arc_length, - alg=alg, - ) - return cluster_ids - - - -def clusterAndLink( - observations, - vx_range=[-0.1, 0.1], - vy_range=[-0.1, 0.1], - vx_bins=100, - vy_bins=100, - eps=0.005, - min_obs=5, - min_arc_length=1.0, - alg="dbscan", - num_jobs=1, - parallel_backend="cf", -): +def link_test_orbit( + test_orbit: TestOrbit, + observations: Observations, + filters: Optional[List[ObservationFilter]] = None, + config: Optional[Config] = None, +) -> Iterator[Any]: """ - Cluster and link correctly projected (after ranging and shifting) - detections. + Run THOR for a single test orbit on the given observations. This function will yield + results at each stage of the pipeline. + 1. transformed_detections + 2. clusters, cluster_members + 3. iod_orbits, iod_orbit_members + 4. od_orbits, od_orbit_members + 5. 
recovered_orbits, recovered_orbit_members Parameters ---------- - observations : `~pandas.DataFrame` - DataFrame containing post-range and shift observations. - vx_range : {None, list or `~numpy.ndarray` (2)} - Maximum and minimum velocity range in x. - [Default = [-0.1, 0.1]] - vy_range : {None, list or `~numpy.ndarray` (2)} - Maximum and minimum velocity range in y. - [Default = [-0.1, 0.1]] - vx_bins : int, optional - Length of x-velocity grid between vx_range[0] - and vx_range[-1]. - [Default = 100] - vy_bins: int, optional - Length of y-velocity grid between vy_range[0] - and vy_range[-1]. - [Default = 100] - eps : float, optional - The maximum distance between two samples for them to be considered - as in the same neighborhood. - See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html - [Default = 0.005] - min_obs : int, optional - The number of samples (or total weight) in a neighborhood for a - point to be considered as a core point. This includes the point itself. - See: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.dbscan.html - [Default = 5] - alg: str - Algorithm to use. Can be "dbscan" or "hotspot_2d". - num_jobs : int, optional - Number of jobs to launch. - parallel_backend : str, optional - Which parallelization backend to use {'ray', 'mp', 'cf'}. - Defaults to using Python's concurrent futures module ('cf'). - - Returns - ------- - clusters : `~pandas.DataFrame` - DataFrame with the cluster ID, the number of observations, and the x and y velocity. - cluster_members : `~pandas.DataFrame` - DataFrame containing the cluster ID and the observation IDs of its members. - - Notes - ----- - The algorithm chosen can have a big impact on performance and accuracy. - - alg="dbscan" uses the DBSCAN algorithm of Ester et. al. It's relatively slow - but works with high accuracy; it is certain to find all clusters with at - least min_obs points that are separated by at most eps. - - alg="hotspot_2d" is much faster (perhaps 10-20x faster) than dbscan, but it - may miss some clusters, particularly when points are spaced a distance of 'eps' - apart. + test_orbit : `~thor.orbit.TestOrbit` + Test orbit to use to gather and transform observations. + observations : `~thor.observations.observations.Observations` + Observations from which range and transform the detections. + filters : list of `~thor.observations.filters.ObservationFilter`, optional + List of filters to apply to the observations before running THOR. + propagator : `~adam_core.propagator.propagator.Propagator` + Propagator to use to propagate the test orbit and generate + ephemerides. + max_processes : int, optional + Maximum number of processes to use for parallelization. 
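+    config : `~thor.config.Config`, optional
+        Configuration for the clustering, initial orbit determination,
+        differential correction, and arc extension stages. If None, a
+        default Config is used.
+
+    Examples
+    --------
+    A minimal sketch of how the generator is typically consumed (the
+    `test_orbit` and `observations` names here are illustrative):
+
+    >>> for stage_result in link_test_orbit(test_orbit, observations):
+    ...     pass  # one result per stage, ending with (recovered_orbits, recovered_orbit_members)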
""" - time_start_cluster = time.time() - logger.info("Running velocity space clustering...") - - vx = np.linspace(*vx_range, num=vx_bins) - vy = np.linspace(*vy_range, num=vy_bins) - vxx, vyy = np.meshgrid(vx, vy) - vxx = vxx.flatten() - vyy = vyy.flatten() - - logger.debug("X velocity range: {}".format(vx_range)) - logger.debug("X velocity bins: {}".format(vx_bins)) - logger.debug("Y velocity range: {}".format(vy_range)) - logger.debug("Y velocity bins: {}".format(vy_bins)) - logger.debug("Velocity grid size: {}".format(vx_bins)) - logger.info("Max sample distance: {}".format(eps)) - logger.info("Minimum samples: {}".format(min_obs)) - - possible_clusters = [] - if len(observations) > 0: - # Extract useful quantities - obs_ids = observations["obs_id"].values - theta_x = observations["theta_x_deg"].values - theta_y = observations["theta_y_deg"].values - mjd = observations["mjd_utc"].values - - # Select detections in first exposure - first = np.where(mjd == mjd.min())[0] - mjd0 = mjd[first][0] - dt = mjd - mjd0 + time_start = time.perf_counter() + logger.info("Running test orbit...") - parallel, num_workers = _checkParallel(num_jobs, parallel_backend) - if parallel: - if parallel_backend == "ray": - import ray + if config is None: + config = Config() - if not ray.is_initialized(): - ray.init(address="auto") - - clusterVelocity_worker_ray = ray.remote(clusterVelocity_worker) - clusterVelocity_worker_ray = clusterVelocity_worker_ray.options( - num_returns=1, num_cpus=1 - ) - - # Put all arrays (which can be large) in ray's - # local object store ahead of time - obs_ids_oid = ray.put(obs_ids) - theta_x_oid = ray.put(theta_x) - theta_y_oid = ray.put(theta_y) - dt_oid = ray.put(dt) - - p = [] - for vxi, vyi in zip(vxx, vyy): - p.append( - clusterVelocity_worker_ray.remote( - vxi, - vyi, - obs_ids=obs_ids_oid, - x=theta_x_oid, - y=theta_y_oid, - dt=dt_oid, - eps=eps, - min_obs=min_obs, - min_arc_length=min_arc_length, - alg=alg, - ) - ) - possible_clusters = ray.get(p) - - elif parallel_backend == "mp": - p = mp.Pool(processes=num_workers, initializer=_initWorker) - possible_clusters = p.starmap( - partial( - clusterVelocity_worker, - obs_ids=obs_ids, - x=theta_x, - y=theta_y, - dt=dt, - eps=eps, - min_obs=min_obs, - min_arc_length=min_arc_length, - alg=alg, - ), - zip(vxx, vyy), - ) - p.close() - - elif parallel_backend == "cf": - with cf.ProcessPoolExecutor( - max_workers=num_workers, initializer=_initWorker - ) as executor: - futures = [] - for vxi, vyi in zip(vxx, vyy): - f = executor.submit( - clusterVelocity_worker, - vxi, - vyi, - obs_ids=obs_ids, - x=theta_x, - y=theta_y, - dt=dt, - eps=eps, - min_obs=min_obs, - min_arc_length=min_arc_length, - alg=alg, - ) - futures.append(f) - - possible_clusters = [] - for f in cf.as_completed(futures): - possible_clusters.append(f.result()) - - else: - raise ValueError( - "Invalid parallel_backend: {}".format(parallel_backend) - ) - - else: - possible_clusters = [] - for vxi, vyi in zip(vxx, vyy): - possible_clusters.append( - clusterVelocity( - obs_ids, - theta_x, - theta_y, - dt, - vxi, - vyi, - eps=eps, - min_obs=min_obs, - min_arc_length=min_arc_length, - alg=alg, - ) - ) - - time_end_cluster = time.time() - logger.info( - "Clustering completed in {:.3f} seconds.".format( - time_end_cluster - time_start_cluster - ) - ) - - logger.info("Restructuring clusters...") - time_start_restr = time.time() - - possible_clusters = pd.DataFrame({"clusters": possible_clusters}) - - # Remove empty clusters - possible_clusters = 
possible_clusters[~possible_clusters["clusters"].isna()] - - if len(possible_clusters) != 0: - ### The following code is a little messy, its a lot of pandas dataframe manipulation. - ### I have tried doing an overhaul wherein the clusters and cluster_members dataframe are created per - ### velocity combination in the clusterVelocity function. However, this adds an overhead in that function - ### of ~ 1ms. So clustering 90,000 velocities takes 90 seconds longer which on small datasets is problematic. - ### On large datasets, the effect is not as pronounced because the below code takes a while to run due to - ### in-memory pandas dataframe restructuring. - - # Make DataFrame with cluster velocities so we can figure out which - # velocities yielded clusters, add names to index so we can enable the join - cluster_velocities = pd.DataFrame({"vtheta_x": vxx, "vtheta_y": vyy}) - cluster_velocities.index.set_names("velocity_id", inplace=True) - - # Split lists of cluster ids into one column per cluster for each different velocity - # then stack the result - possible_clusters = pd.DataFrame( - possible_clusters["clusters"].values.tolist(), index=possible_clusters.index - ) - possible_clusters = pd.DataFrame(possible_clusters.stack()) - possible_clusters.rename(columns={0: "obs_ids"}, inplace=True) - possible_clusters = pd.DataFrame( - possible_clusters["obs_ids"].values.tolist(), index=possible_clusters.index + if config.propagator == "PYOORB": + propagator = PYOORB + else: + raise ValueError(f"Unknown propagator: {config.propagator}") + + use_ray = False + if config.max_processes is None or config.max_processes > 1: + # Initialize ray + if not ray.is_initialized(): + logger.debug( + f"Ray is not initialized. Initializing with {config.max_processes}..." + ) + ray.init(num_cpus=config.max_processes) + + if not isinstance(observations, ray.ObjectRef): + observations = ray.put(observations) + + use_ray = True + + # Apply filters to the observations + filtered_observations = observations + if filters is None: + # By default we always filter by radius from the predicted position of the test orbit + filters = [TestOrbitRadiusObservationFilter(radius=config.cell_radius)] + for filter_i in filters: + filtered_observations = filter_i.apply( + filtered_observations, test_orbit, max_processes=config.max_processes ) - # Drop duplicate clusters - possible_clusters.drop_duplicates(inplace=True) - - # Set index names - possible_clusters.index.set_names(["velocity_id", "cluster_id"], inplace=True) - - # Reset index - possible_clusters.reset_index("cluster_id", drop=True, inplace=True) - possible_clusters["cluster_id"] = [ - str(uuid.uuid4().hex) for i in range(len(possible_clusters)) - ] + # Defragment the observations + if len(filtered_observations) > 0: + filtered_observations = qv.defragment(filtered_observations) - # Make clusters DataFrame - clusters = possible_clusters.join(cluster_velocities) - clusters.reset_index(drop=True, inplace=True) - clusters = clusters[["cluster_id", "vtheta_x", "vtheta_y"]] - - # Make cluster_members DataFrame - cluster_members = possible_clusters.reset_index(drop=True).copy() - cluster_members.index = cluster_members["cluster_id"] - cluster_members.drop("cluster_id", axis=1, inplace=True) - cluster_members = pd.DataFrame(cluster_members.stack()) - cluster_members.rename(columns={0: "obs_id"}, inplace=True) - cluster_members.reset_index(inplace=True) - cluster_members.drop("level_1", axis=1, inplace=True) - - # Calculate arc length and add it to the clusters dataframe - 
cluster_members_time = cluster_members.merge( - observations[["obs_id", "mjd_utc"]], on="obs_id", how="left" - ) - clusters_time = ( - cluster_members_time.groupby(by=["cluster_id"])["mjd_utc"] - .apply(lambda x: x.max() - x.min()) - .to_frame() - ) - clusters_time.reset_index(inplace=True) - clusters_time.rename(columns={"mjd_utc": "arc_length"}, inplace=True) - clusters = clusters.merge( - clusters_time[["cluster_id", "arc_length"]], - on="cluster_id", - how="left", - ) + # Observations are no longer needed, so we can delete them + del observations - else: - cluster_members = pd.DataFrame(columns=["cluster_id", "obs_id"]) - clusters = pd.DataFrame( - columns=["cluster_id", "vtheta_x", "vtheta_y", "arc_length"] - ) + if use_ray: + filtered_observations = ray.put(filtered_observations) - time_end_restr = time.time() - logger.info( - "Restructuring completed in {:.3f} seconds.".format( - time_end_restr - time_start_restr - ) + # Range and transform the observations + transformed_detections = range_and_transform( + test_orbit, + filtered_observations, + propagator=propagator, + max_processes=config.max_processes, ) - logger.info("Found {} clusters.".format(len(clusters))) - logger.info( - "Clustering and restructuring completed in {:.3f} seconds.".format( - time_end_restr - time_start_cluster - ) + yield transformed_detections + + # TODO: ray support for the rest of the pipeline has not yet been implemented + # so we will convert the ray objects to regular objects for now + if use_ray: + filtered_observations = ray.get(filtered_observations) + + # Run clustering + clusters, cluster_members = cluster_and_link( + transformed_detections, + vx_range=[config.vx_min, config.vx_max], + vy_range=[config.vy_min, config.vy_max], + vx_bins=config.vx_bins, + vy_bins=config.vy_bins, + radius=config.cluster_radius, + min_obs=config.cluster_min_obs, + min_arc_length=config.cluster_min_arc_length, + alg=config.cluster_algorithm, + chunk_size=config.cluster_chunk_size, + max_processes=config.max_processes, + ) + yield clusters, cluster_members + + # Run initial orbit determination + iod_orbits, iod_orbit_members = initial_orbit_determination( + filtered_observations, + cluster_members, + min_obs=config.iod_min_obs, + min_arc_length=config.iod_min_arc_length, + contamination_percentage=config.iod_contamination_percentage, + rchi2_threshold=config.iod_rchi2_threshold, + observation_selection_method=config.iod_observation_selection_method, + propagator=propagator, + propagator_kwargs={}, + chunk_size=config.iod_chunk_size, + max_processes=config.max_processes, + # TODO: investigate whether these should be configurable + iterate=False, + light_time=True, + linkage_id_col="cluster_id", + ) + yield iod_orbits, iod_orbit_members + + # Run differential correction + od_orbits, od_orbit_members = differential_correction( + iod_orbits, + iod_orbit_members, + filtered_observations, + min_obs=config.od_min_obs, + min_arc_length=config.od_min_arc_length, + contamination_percentage=config.od_contamination_percentage, + rchi2_threshold=config.od_rchi2_threshold, + delta=config.od_delta, + max_iter=config.od_max_iter, + propagator=config.propagator, + propagator_kwargs={}, + chunk_size=config.od_chunk_size, + max_processes=config.max_processes, + # TODO: investigate whether these should be configurable + method="central", + fit_epoch=False, + ) + yield od_orbits, od_orbit_members + + # Run arc extension + recovered_orbits, recovered_orbit_members = merge_and_extend_orbits( + od_orbits, + od_orbit_members, + 
filtered_observations, + min_obs=config.arc_extension_min_obs, + min_arc_length=config.arc_extension_min_arc_length, + contamination_percentage=config.arc_extension_contamination_percentage, + rchi2_threshold=config.arc_extension_rchi2_threshold, + radius=config.arc_extension_radius, + delta=config.od_delta, + max_iter=config.od_max_iter, + propagator=config.propagator, + propagator_kwargs={}, + orbits_chunk_size=config.arc_extension_chunk_size, + max_processes=config.max_processes, + # TODO: investigate whether these should be configurable + method="central", + fit_epoch=False, + observations_chunk_size=100000, ) + yield recovered_orbits, recovered_orbit_members - return clusters, cluster_members + time_end = time.perf_counter() + logger.info(f"Test orbit completed in {time_end-time_start:.3f} seconds.") diff --git a/thor/main_2.py b/thor/main_2.py deleted file mode 100644 index fd39b72c..00000000 --- a/thor/main_2.py +++ /dev/null @@ -1,506 +0,0 @@ -import logging -import time -from typing import Any, Iterator, List, Optional, Union - -import pandas as pd -import pyarrow.compute as pc -import quivr as qv -import ray -from adam_core.coordinates import ( - CartesianCoordinates, - OriginCodes, - transform_coordinates, -) -from adam_core.propagator import PYOORB, Propagator - -from .config import Config -from .main import clusterAndLink -from .observations.filters import ObservationFilter, TestOrbitRadiusObservationFilter -from .observations.observations import Observations, ObserversWithStates -from .orbit import TestOrbit, TestOrbitEphemeris -from .orbits import ( - differentialCorrection, - initialOrbitDetermination, - mergeAndExtendOrbits, -) -from .projections import GnomonicCoordinates - -logger = logging.getLogger(__name__) - - -class TransformedDetections(qv.Table): - id = qv.StringColumn() - coordinates = GnomonicCoordinates.as_column() - state_id = qv.Int64Column() - - -def range_and_transform_worker( - ranged_detections: CartesianCoordinates, - observations: Observations, - ephemeris: TestOrbitEphemeris, - state_id: int, -) -> TransformedDetections: - """ - Given ranged detections and their original observations, transform these to the gnomonic tangent - plane centered on the motion of the test orbit for a single state. - - Parameters - ---------- - ranged_detections - Cartesian detections ranged so that their heliocentric distance is the same as the test orbit - for each state - observations - The observations from which the ranged detections originate. These should be sorted one-to-one - with the ranged detections - ephemeris - Ephemeris from which to extract the test orbit's aberrated state. - state_id - The ID for this particular state. - - Returns - ------- - transformed_detections - Detections transformed to a gnomonic tangent plane centered on the motion of the - test orbit. 
- """ - # Select the detections and ephemeris for this state id - mask = pc.equal(state_id, observations.state_id) - ranged_detections_state = ranged_detections.apply_mask(mask) - ephemeris_state = ephemeris.select("id", state_id) - observations_state = observations.select("state_id", state_id) - - # Transform the detections into the co-rotating frame - return TransformedDetections.from_kwargs( - id=observations_state.detections.id, - coordinates=GnomonicCoordinates.from_cartesian( - ranged_detections_state, - center_cartesian=ephemeris_state.ephemeris.aberrated_coordinates, - ), - state_id=observations_state.state_id, - ) - - -range_and_transform_remote = ray.remote(range_and_transform_worker) - - -def range_and_transform( - test_orbit: TestOrbit, - observations: Union[Observations, ray.ObjectRef], - propagator: Propagator = PYOORB(), - max_processes: Optional[int] = 1, -) -> TransformedDetections: - """ - Range observations for a single test orbit and transform them into a - gnomonic projection centered on the motion of the test orbit (co-rotating - frame). - - Parameters - ---------- - test_orbit : `~thor.orbit.TestOrbit` - Test orbit to use to gather and transform observations. - observations : `~thor.observations.observations.Observations` - Observations from which range and transform the detections. - propagator : `~adam_core.propagator.propagator.Propagator` - Propagator to use to propagate the test orbit and generate - ephemerides. - max_processes : int, optional - Maximum number of processes to use for parallelization. If - an existing ray cluster is already running, this parameter - will be ignored if larger than 1 or not None. - - Returns - ------- - transformed_detections : `~thor.main.TransformedDetections` - The transformed detections as gnomonic coordinates - of the observations in the co-rotating frame. - """ - time_start = time.perf_counter() - logger.info("Running range and transform...") - logger.info(f"Assuming r = {test_orbit.orbit.coordinates.r[0]} au") - logger.info(f"Assuming v = {test_orbit.orbit.coordinates.v[0]} au/d") - - if isinstance(observations, ray.ObjectRef): - observations = ray.get(observations) - - if len(observations) > 0: - # Compute the ephemeris of the test orbit (this will be cached) - ephemeris = test_orbit.generate_ephemeris_from_observations( - observations, - propagator=propagator, - max_processes=max_processes, - ) - - # Assume that the heliocentric distance of all point sources in - # the observations are the same as that of the test orbit - ranged_detections_spherical = test_orbit.range_observations( - observations, - propagator=propagator, - max_processes=max_processes, - ) - - # Transform from spherical topocentric to cartesian heliocentric coordinates - ranged_detections_cartesian = transform_coordinates( - ranged_detections_spherical.coordinates, - representation_out=CartesianCoordinates, - frame_out="ecliptic", - origin_out=OriginCodes.SUN, - ) - - transformed_detection_list = [] - if max_processes is None or max_processes > 1: - - if not ray.is_initialized(): - logger.debug( - f"Ray is not initialized. Initializing with {max_processes}..." 
- ) - ray.init(num_cpus=max_processes) - - if isinstance(observations, ray.ObjectRef): - observations_ref = observations - observations = ray.get(observations_ref) - else: - observations_ref = ray.put(observations) - - if isinstance(ephemeris, ray.ObjectRef): - ephemeris_ref = ephemeris - else: - ephemeris_ref = ray.put(ephemeris) - - ranged_detections_cartesian_ref = ray.put(ranged_detections_cartesian) - - # Get state IDs - state_ids = observations.state_id.unique().sort() - futures = [] - for state_id in state_ids: - futures.append( - range_and_transform_remote.remote( - ranged_detections_cartesian_ref, - observations_ref, - ephemeris_ref, - state_id, - ) - ) - - while futures: - finished, futures = ray.wait(futures, num_returns=1) - transformed_detection_list.append(ray.get(finished[0])) - - else: - # Get state IDs - state_ids = observations.state_id.unique().sort() - for state_id in state_ids: - mask = pc.equal(state_id, observations.state_id) - transformed_detection_list.append( - range_and_transform_worker( - ranged_detections_cartesian.apply_mask(mask), - observations.select("state_id", state_id), - ephemeris.select("id", state_id), - state_id, - ) - ) - - transformed_detections = qv.concatenate(transformed_detection_list) - transformed_detections = transformed_detections.sort_by(by=["state_id"]) - - else: - transformed_detections = TransformedDetections.empty() - - time_end = time.perf_counter() - logger.info(f"Transformed {len(transformed_detections)} observations.") - logger.info( - f"Range and transform completed in {time_end - time_start:.3f} seconds." - ) - return transformed_detections - - -def _observations_to_observations_df(observations: Observations) -> pd.DataFrame: - """ - Convert THOR observations (v2.0) to the older format used by the rest of the - pipeline. This will eventually be removed once the rest of the pipeline is - updated to use the new format. - - Parameters - ---------- - observations : `~thor.observations.observations.Observations` - Observations to convert. - - Returns - ------- - observations_df : `~pandas.DataFrame` - Observations in the old format. - """ - observations_df = observations.to_dataframe() - observations_df.rename( - columns={ - "detections.id": "obs_id", - "detections.ra": "RA_deg", - "detections.dec": "Dec_deg", - "detections.ra_sigma": "RA_sigma_deg", - "detections.dec_sigma": "Dec_sigma_deg", - "detections.mag": "mag", - "detections.mag_sigma": "mag_sigma", - }, - inplace=True, - ) - observations_df["mjd_utc"] = ( - observations.detections.time.rescale("utc").mjd().to_numpy(zero_copy_only=False) - ) - return observations_df - - -def _observers_with_states_to_observers_df( - observers: ObserversWithStates, -) -> pd.DataFrame: - """ - Convert THOR observers (v2.0) to the older format used by the rest of the - pipeline. This will eventually be removed once the rest of the pipeline is - updated to use the new format. - - Parameters - ---------- - observers : `~adam_core.observers.observers.Observers` - Observers to convert to a dataframe. - - Returns - ------- - observers_df : `~pandas.DataFrame` - Observers in the old format. 
- """ - observers_df = observers.to_dataframe() - observers_df.rename( - columns={ - "observers.coordinates.x": "obs_x", - "observers.coordinates.y": "obs_y", - "observers.coordinates.z": "obs_z", - "observers.coordinates.vx": "obs_vx", - "observers.coordinates.vy": "obs_vy", - "observers.coordinates.vz": "obs_vz", - }, - inplace=True, - ) - return observers_df - - -def _transformed_detections_to_transformed_detections_df( - transformed_detections: TransformedDetections, -) -> pd.DataFrame: - """ - Convert THOR transformed detections (v2.0) to the older format used by the - rest of the pipeline. This will eventually be removed once the rest of the - pipeline is updated to use the new format. - - Parameters - ---------- - transformed_detections : `~thor.main.TransformedDetections` - Transformed detections to convert to a dataframe. - - Returns - ------- - transformed_detections_df : `~pandas.DataFrame` - Transformed detections in the old format. - """ - transformed_detections_df = transformed_detections.to_dataframe() - transformed_detections_df.rename( - columns={ - "id": "obs_id", - "coordinates.theta_x": "theta_x_deg", - "coordinates.theta_y": "theta_y_deg", - }, - inplace=True, - ) - return transformed_detections_df - - -def link_test_orbit( - test_orbit: TestOrbit, - observations: Observations, - filters: Optional[List[ObservationFilter]] = None, - config: Optional[Config] = None, -) -> Iterator[Any]: - """ - Run THOR for a single test orbit on the given observations. This function will yield - results at each stage of the pipeline. - 1. transformed_detections - 2. clusters, cluster_members - 3. iod_orbits, iod_orbit_members - 4. od_orbits, od_orbit_members - 5. recovered_orbits, recovered_orbit_members - - Parameters - ---------- - test_orbit : `~thor.orbit.TestOrbit` - Test orbit to use to gather and transform observations. - observations : `~thor.observations.observations.Observations` - Observations from which range and transform the detections. - filters : list of `~thor.observations.filters.ObservationFilter`, optional - List of filters to apply to the observations before running THOR. - propagator : `~adam_core.propagator.propagator.Propagator` - Propagator to use to propagate the test orbit and generate - ephemerides. - max_processes : int, optional - Maximum number of processes to use for parallelization. - """ - time_start = time.perf_counter() - logger.info("Running test orbit...") - - if config is None: - config = Config() - - if config.propagator == "PYOORB": - propagator = PYOORB() - else: - raise ValueError(f"Unknown propagator: {config.propagator}") - - use_ray = False - if config.max_processes is None or config.max_processes > 1: - # Initialize ray - if not ray.is_initialized(): - logger.debug( - f"Ray is not initialized. Initializing with {config.max_processes}..." 
- ) - ray.init(num_cpus=config.max_processes) - - if not isinstance(observations, ray.ObjectRef): - observations = ray.put(observations) - - use_ray = True - - # Apply filters to the observations - filtered_observations = observations - if filters is None: - # By default we always filter by radius from the predicted position of the test orbit - filters = [TestOrbitRadiusObservationFilter(radius=config.cell_radius)] - for filter_i in filters: - filtered_observations = filter_i.apply( - filtered_observations, test_orbit, max_processes=config.max_processes - ) - - # Defragment the observations - if len(filtered_observations) > 0: - filtered_observations = qv.defragment(filtered_observations) - - # Observations are no longer needed, so we can delete them - del observations - - if use_ray: - filtered_observations = ray.put(filtered_observations) - - # Range and transform the observations - transformed_detections = range_and_transform( - test_orbit, - filtered_observations, - propagator=propagator, - max_processes=config.max_processes, - ) - yield transformed_detections - - # TODO: ray support for the rest of the pipeline has not yet been implemented - # so we will convert the ray objects to regular objects for now - if use_ray: - filtered_observations = ray.get(filtered_observations) - - if len(filtered_observations) > 0: - # Convert quivr tables to dataframes used by the rest of the pipeline - observations_df = _observations_to_observations_df(filtered_observations) - observers_df = _observers_with_states_to_observers_df( - filtered_observations.get_observers() - ) - transformed_detections_df = ( - _transformed_detections_to_transformed_detections_df(transformed_detections) - ) - - # Merge dataframes together - observations_df = observations_df.merge(observers_df, on="state_id") - transformed_detections_df = transformed_detections_df.merge( - observations_df[["obs_id", "mjd_utc", "observatory_code"]], on="obs_id" - ) - else: - transformed_detections_df = pd.DataFrame() - observations_df = pd.DataFrame() - - # Run clustering - clusters, cluster_members = clusterAndLink( - transformed_detections_df, - vx_range=[config.vx_min, config.vx_max], - vy_range=[config.vy_min, config.vy_max], - vx_bins=config.vx_bins, - vy_bins=config.vy_bins, - eps=config.cluster_radius, - min_obs=config.cluster_min_obs, - min_arc_length=config.cluster_min_arc_length, - alg=config.cluster_algorithm, - num_jobs=config.max_processes, - parallel_backend=config.parallel_backend, - ) - yield clusters, cluster_members - - # Run initial orbit determination - iod_orbits, iod_orbit_members = initialOrbitDetermination( - observations_df, - cluster_members, - min_obs=config.iod_min_obs, - min_arc_length=config.iod_min_arc_length, - contamination_percentage=config.iod_contamination_percentage, - rchi2_threshold=config.iod_rchi2_threshold, - observation_selection_method=config.iod_observation_selection_method, - propagator=config.propagator, - propagator_kwargs={}, - chunk_size=config.iod_chunk_size, - num_jobs=config.max_processes, - parallel_backend=config.parallel_backend, - # TODO: investigate whether these should be configurable - iterate=False, - identify_subsets=False, - light_time=True, - linkage_id_col="cluster_id", - ) - yield iod_orbits, iod_orbit_members - - # Run differential correction - od_orbits, od_orbit_members = differentialCorrection( - iod_orbits, - iod_orbit_members, - observations_df, - min_obs=config.od_min_obs, - min_arc_length=config.od_min_arc_length, - 
contamination_percentage=config.od_contamination_percentage, - rchi2_threshold=config.od_rchi2_threshold, - delta=config.od_delta, - max_iter=config.od_max_iter, - propagator=config.propagator, - propagator_kwargs={}, - chunk_size=config.od_chunk_size, - num_jobs=config.max_processes, - parallel_backend=config.parallel_backend, - # TODO: investigate whether these should be configurable - method="central", - fit_epoch=False, - ) - yield od_orbits, od_orbit_members - - # Run arc extension - recovered_orbits, recovered_orbit_members = mergeAndExtendOrbits( - od_orbits, - od_orbit_members, - observations_df, - min_obs=config.arc_extension_min_obs, - min_arc_length=config.arc_extension_min_arc_length, - contamination_percentage=config.arc_extension_contamination_percentage, - rchi2_threshold=config.arc_extension_rchi2_threshold, - eps=config.arc_extension_radius, - delta=config.od_delta, - max_iter=config.od_max_iter, - propagator=config.propagator, - propagator_kwargs={}, - orbits_chunk_size=config.arc_extension_chunk_size, - num_jobs=config.max_processes, - parallel_backend=config.parallel_backend, - # TODO: investigate whether these should be configurable - method="central", - fit_epoch=False, - observations_chunk_size=100000, - ) - yield recovered_orbits, recovered_orbit_members - - time_end = time.perf_counter() - logger.info(f"Test orbit completed in {time_end-time_start:.3f} seconds.") diff --git a/thor/observations/observations.py b/thor/observations/observations.py index 0c820a79..8af588f1 100644 --- a/thor/observations/observations.py +++ b/thor/observations/observations.py @@ -4,6 +4,7 @@ import pyarrow as pa import pyarrow.compute as pc import quivr as qv +from adam_core.coordinates import CoordinateCovariances, Origin, SphericalCoordinates from adam_core.observations import Exposures, PointSourceDetections from adam_core.observers import Observers from adam_core.time import Timestamp @@ -193,6 +194,28 @@ def get_observers(self) -> ObserversWithStates: observers = qv.concatenate(observers_with_states) return observers.sort_by("state_id") + def to_spherical_coordinates(self) -> SphericalCoordinates: + """ + Convert the observations to spherical coordinates which can be used + to calculate residuals. + + Returns + ------- + coordinates : `~adam_core.coordinates.spherical.SphericalCoordinates` + The detections represented as spherical coordinates. + """ + sigmas = np.zeros((len(self), 6)) + sigmas[:, 1] = self.detections.ra_sigma.to_numpy(zero_copy_only=False) + sigmas[:, 2] = self.detections.dec_sigma.to_numpy(zero_copy_only=False) + return SphericalCoordinates.from_kwargs( + lon=self.detections.ra, + lat=self.detections.dec, + time=self.detections.time, + covariance=CoordinateCovariances.from_sigmas(sigmas), + origin=Origin.from_kwargs(code=self.observatory_code), + frame="equatorial", + ) + def select_exposure(self, exposure_id: int) -> "Observations": """ Select observations from a single exposure. 
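A minimal usage sketch of the new Observations.to_spherical_coordinates() method added above. It is only illustrative: `observations` is assumed to be an Observations table and `ephemeris` an ephemeris table row-aligned with those detections (both names are placeholders, not defined in this patch). The method packages the detections' RA/Dec and their sigmas into an adam_core SphericalCoordinates table, which is how attribution_worker and iod compute residuals later in this patch.

from adam_core.coordinates.residuals import Residuals

# Detections as spherical coordinates with per-detection RA/Dec sigmas
coords = observations.to_spherical_coordinates()

# Residuals and chi2 against the predicted coordinates (assumed row-aligned)
residuals = Residuals.calculate(coords, ephemeris.coordinates)
chi2 = residuals.chi2.to_numpy()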
diff --git a/thor/orbit_determination/__init__.py b/thor/orbit_determination/__init__.py new file mode 100644 index 00000000..0e9f032d --- /dev/null +++ b/thor/orbit_determination/__init__.py @@ -0,0 +1 @@ +from .fitted_orbits import * diff --git a/thor/orbit_determination/fitted_orbits.py b/thor/orbit_determination/fitted_orbits.py new file mode 100644 index 00000000..2ab94093 --- /dev/null +++ b/thor/orbit_determination/fitted_orbits.py @@ -0,0 +1,176 @@ +import uuid +from typing import List, Literal, Optional, Tuple + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import quivr as qv +from adam_core.coordinates import CartesianCoordinates +from adam_core.coordinates.residuals import Residuals +from adam_core.orbits import Orbits + +from ..utils.quivr import drop_duplicates + +__all__ = [ + "FittedOrbits", + "FittedOrbitMembers", +] + + +class FittedOrbits(qv.Table): + + orbit_id = qv.StringColumn(default=lambda: uuid.uuid4().hex) + object_id = qv.StringColumn(nullable=True) + coordinates = CartesianCoordinates.as_column() + arc_length = qv.Float64Column() + num_obs = qv.Int64Column() + chi2 = qv.Float64Column() + reduced_chi2 = qv.Float64Column() + improved = qv.BooleanColumn(nullable=True) + + def to_orbits(self) -> Orbits: + """ + Convert fitted orbits to orbits that can be used by + a Propagator. + + Returns + ------- + orbits : `~adam_core.orbits.Orbits` + Orbits. + """ + return Orbits.from_kwargs( + orbit_id=self.orbit_id, + object_id=self.object_id, + coordinates=self.coordinates, + ) + + def drop_duplicates( + self, + orbit_members: "FittedOrbitMembers", + subset: Optional[List[str]] = None, + keep: Literal["first", "last"] = "first", + ) -> Tuple["FittedOrbits", "FittedOrbitMembers"]: + """ + Drop duplicate orbits from the fitted orbits and remove + the corresponding orbit members. + + Parameters + ---------- + orbit_members : `~thor.orbit_determination.FittedOrbitMembers` + Fitted orbit members. + subset : list of str, optional + Subset of columns to consider when dropping duplicates. If not specified all the columns + specifying unique state are used: time, x, y, z, vx, vy, vz. + keep : {'first', 'last'}, default 'first' + If there are duplicate rows then keep the first or last row. + + Returns + ------- + filtered : `~thor.orbit_determination.FittedOrbits` + Fitted orbits without duplicates. + filtered_orbit_members : `~thor.orbit_determination.FittedOrbitMembers` + Fitted orbit members without duplicates. + """ + if subset is None: + subset = [ + "coordinates.time.days", + "coordinates.time.nanos", + "coordinates.x", + "coordinates.y", + "coordinates.z", + "coordinates.vx", + "coordinates.vy", + "coordinates.vz", + ] + + filtered = drop_duplicates(self, subset=subset, keep=keep) + filtered_orbit_members = orbit_members.apply_mask( + pc.is_in(orbit_members.orbit_id, filtered.orbit_id) + ) + return filtered, filtered_orbit_members + + def assign_duplicate_observations( + self, orbit_members: "FittedOrbitMembers" + ) -> Tuple["FittedOrbits", "FittedOrbitMembers"]: + """ + Assigns observations that have been assigned to multiple orbits to the orbit with t + he most observations, longest arc length, and lowest reduced chi2. + + Parameters + ---------- + orbit_members : `~thor.orbit_determination.FittedOrbitMembers` + Fitted orbit members. + + Returns + ------- + filtered : `~thor.orbit_determination.FittedOrbits` + Fitted orbits with duplicate assignments removed. 
+        filtered_orbit_members : `~thor.orbit_determination.FittedOrbitMembers`
+            Fitted orbit members with duplicate assignments removed.
+        """
+        # Sort by number of observations, arc length, and reduced chi2
+        # Here we assume that orbits with more observations, longer arc lengths, and lower reduced chi2
+        # are better candidates for claiming detections that have been assigned to multiple orbits
+        sorted = self.sort_by(
+            [
+                ("num_obs", "descending"),
+                ("arc_length", "descending"),
+                ("reduced_chi2", "ascending"),
+            ]
+        )
+
+        # Extract the orbit IDs from the sorted table
+        orbit_ids = sorted.orbit_id.unique()
+
+        # Calculate the order in which these orbit IDs appear in the orbit_members table
+        order_in_orbits = pc.index_in(orbit_members.orbit_id, orbit_ids)
+
+        # Create an index into the orbit_members table and append the order_in_orbits column
+        orbit_members_table = (
+            orbit_members.flattened_table()
+            .append_column("index", pa.array(np.arange(len(orbit_members))))
+            .append_column("order_in_orbits", order_in_orbits)
+        )
+
+        # Drop the residual values (a list column) due to: https://github.com/apache/arrow/issues/32504
+        orbit_members_table = orbit_members_table.drop_columns(["residuals.values"])
+
+        # Sort orbit members by the orbit IDs (in the same order as the sorted orbits table)
+        orbit_members_table = orbit_members_table.sort_by(
+            [("order_in_orbits", "ascending")]
+        )
+
+        # Now group by the observation ID and aggregate the index column to get the first index
+        # for each observation ID, i.e. the member row belonging to the best-ranked orbit
+        indices = (
+            orbit_members_table.group_by("obs_id", use_threads=False)
+            .aggregate([("index", "first")])
+            .column("index_first")
+        )
+
+        # Use the indices to filter the orbit_members table and then use the resulting orbit IDs to filter the orbits table
+        filtered_orbit_members = orbit_members.take(indices)
+        filtered = self.apply_mask(
+            pc.is_in(self.orbit_id, filtered_orbit_members.orbit_id)
+        )
+
+        return filtered, filtered_orbit_members
+
+
+class FittedOrbitMembers(qv.Table):
+
+    orbit_id = qv.StringColumn()
+    obs_id = qv.StringColumn()
+    residuals = Residuals.as_column(nullable=True)
+    solution = qv.BooleanColumn(nullable=True)
+    outlier = qv.BooleanColumn(nullable=True)
+
+    def drop_outliers(self) -> "FittedOrbitMembers":
+        """
+        Drop outliers from the fitted orbit members.
+
+        Returns
+        -------
+        fitted_orbit_members : `~thor.orbit_determination.FittedOrbitMembers`
+            Fitted orbit members without outliers.
+ """ + return self.apply_mask(pc.equal(self.outlier, False)) diff --git a/thor/orbit_selection.py b/thor/orbit_selection.py index 10cd17cf..96fd76f2 100644 --- a/thor/orbit_selection.py +++ b/thor/orbit_selection.py @@ -5,8 +5,6 @@ from adam_core.orbits import Orbits from astropy.time import Time -from .utils import assignPatchesHEALPix, assignPatchesSquare - __all__ = [ "findAverageOrbits", "findTestOrbitsPatch", diff --git a/thor/orbits/__init__.py b/thor/orbits/__init__.py index 42552650..a97a21e9 100644 --- a/thor/orbits/__init__.py +++ b/thor/orbits/__init__.py @@ -3,7 +3,6 @@ from .gauss import * from .gibbs import * from .herrick_gibbs import * -from .residuals import * from .attribution import * from .iod import * from .od import * diff --git a/thor/orbits/attribution.py b/thor/orbits/attribution.py index 7a283277..f1684898 100644 --- a/thor/orbits/attribution.py +++ b/thor/orbits/attribution.py @@ -1,92 +1,157 @@ -import os -from typing import Literal - -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["VECLIB_MAXIMUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" - -import concurrent.futures as cf import logging -import multiprocessing as mp import time -from functools import partial +from typing import Literal, Optional, Tuple import numpy as np -import pandas as pd +import numpy.typing as npt +import pyarrow as pa +import pyarrow.compute as pc import quivr as qv -from adam_core.observers import Observers +import ray +from adam_core.coordinates.residuals import Residuals from adam_core.orbits import Orbits from adam_core.propagator import PYOORB from adam_core.propagator.utils import _iterate_chunks -from adam_core.time import Timestamp from sklearn.neighbors import BallTree -from ..utils import ( - _checkParallel, - _initWorker, - calcChunkSize, - removeDuplicateObservations, - sortLinkages, - yieldChunks, -) -from .od import differentialCorrection -from .residuals import calcResiduals -from .utils import _ephemeris_to_dataframe +from ..observations.observations import Observations +from ..orbit_determination import FittedOrbitMembers, FittedOrbits +from .od import differential_correction logger = logging.getLogger(__name__) -__all__ = ["attribution_worker", "attributeObservations", "mergeAndExtendOrbits"] +__all__ = ["Attributions", "attribute_observations", "merge_and_extend_orbits"] + + +LATLOT_INDEX = np.array([2, 1]) + + +class Attributions(qv.Table): + orbit_id = qv.StringColumn() + obs_id = qv.StringColumn() + residuals = Residuals.as_column(nullable=True) + distance = qv.Float64Column(nullable=True) + + def drop_coincident_attributions( + self, observations: Observations + ) -> "Attributions": + """ + Drop attributions that are coincident in time: two or more observations attributed + to the same orbit that occur at the same time. The observation with the lowest + distance is kept. + + Parameters + ---------- + observations : `~thor.observations.observations.Observations` + Observations which will be used to get the observation times. + + Returns + ------- + attributions : `~thor.orbits.attribution.Attributions` + Attributions table with coincident attributions removed. 
+ """ + # Flatten the table so nested columns are dot-delimited at the top level + flattened_table = self.flattened_table() + + # Drop the residual values (a list column) due to: https://github.com/apache/arrow/issues/32504 + flattened_table = flattened_table.drop(["residuals.values"]) + + # Filter the observations to only include those that have been attributed + # to an orbit + observations_filtered = observations.apply_mask( + pc.is_in(observations.detections.id, flattened_table.column("obs_id")) + ) + + # Flatten the observations table + flattened_observations = observations_filtered.flattened_table() + + # Only keep relevant columns + flattened_observations = flattened_observations.select( + ["detections.id", "detections.time.days", "detections.time.nanos"] + ) + + # Join the time column back to the flattened attributions table + flattened_table = flattened_table.join( + flattened_observations, ["obs_id"], right_keys=["detections.id"] + ) + + # Add index column + flattened_table = flattened_table.add_column( + 0, "index", pa.array(np.arange(len(flattened_table))) + ) + + # Sort the table + flattened_table = flattened_table.sort_by( + [ + ("orbit_id", "ascending"), + ("detections.time.days", "ascending"), + ("detections.time.nanos", "ascending"), + ] + ) + + # Group by orbit ID and observation time + indices = ( + flattened_table.group_by( + ["orbit_id", "detections.time.days", "detections.time.nanos"], + use_threads=False, + ) + .aggregate([("index", "first")]) + .column("index_first") + ) + + return self.take(indices) def attribution_worker( - orbits, - observations, - eps=1 / 3600, - include_probabilistic=True, + orbit_ids: npt.NDArray[np.str_], + observation_indices: npt.NDArray[np.int64], + orbits: FittedOrbits, + observations: Observations, + radius: float = 1 / 3600, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, -): +) -> Attributions: if propagator == "PYOORB": prop = PYOORB(**propagator_kwargs) else: raise ValueError(f"Invalid propagator '{propagator}'.") - # Create Observers table - observers_list = [] - for observatory_code in observations["observatory_code"].unique(): - observers_list.append( - Observers.from_code( - observatory_code, - Timestamp.from_mjd( - observations[ - observations["observatory_code"].isin([observatory_code]) - ]["mjd_utc"].unique(), - scale="utc", - ), - ) - ) + # Select the orbits and observations for this batch + observations = observations.take(observation_indices) + orbits = orbits.apply_mask(pc.equal(orbits.orbit_id, orbit_ids)) + + # Get the unique observers for this batch of observations + observers_with_states = observations.get_observers() + observers = observers_with_states.observers - observers = qv.concatenate(observers_list) - observers = observers.sort_by( - ["coordinates.time.days", "coordinates.time.nanos", "code"] + # Generate ephemerides for each orbit at the observation times + ephemeris = prop.generate_ephemeris( + orbits, observers, chunk_size=len(orbits), max_processes=1 ) - # Genereate ephemerides for each orbit at the observation times - ephemeris = prop._generate_ephemeris(orbits, observers) - ephemeris = _ephemeris_to_dataframe(ephemeris) - # TODO: Replace types below with pyarrow/quivr tables - - # Group the predicted ephemerides and observations by visit / exposure - ephemeris_grouped = ephemeris.groupby(by=["observatory_code", "mjd_utc"]) - ephemeris_visits = [ - ephemeris_grouped.get_group(g) for g in ephemeris_grouped.groups - ] - observations_grouped = 
observations.groupby(by=["observatory_code", "mjd_utc"]) - observations_visits = [ - observations_grouped.get_group(g) for g in observations_grouped.groups - ] + # Round the ephemeris and observations to the nearest millisecond + ephemeris = ephemeris.set_column( + "coordinates.time", ephemeris.coordinates.time.rounded(precision="ms") + ) + observations_rounded = observations.set_column( + "detections.time", observations.detections.time.rounded(precision="ms") + ) + + # Create a linkage between the ephemeris and observations + linkage = qv.MultiKeyLinkage( + ephemeris, + observations_rounded, + left_keys={ + "days": ephemeris.coordinates.time.days, + "nanos": ephemeris.coordinates.time.nanos, + "code": ephemeris.coordinates.origin.code, + }, + right_keys={ + "days": observations_rounded.detections.time.days, + "nanos": observations_rounded.detections.time.nanos, + "code": observations_rounded.observatory_code, + }, + ) # Loop through each unique exposure and visit, find the nearest observations within # eps (haversine metric) @@ -94,33 +159,23 @@ def attribution_worker( orbit_ids_associated = [] obs_ids_associated = [] obs_times_associated = [] - eps_rad = np.radians(eps) + radius_rad = np.radians(radius) residuals = [] - stats = [] - for ephemeris_visit, observations_visit in zip( - ephemeris_visits, observations_visits - ): - - assert len(ephemeris_visit["mjd_utc"].unique()) == 1 - assert len(observations_visit["mjd_utc"].unique()) == 1 - assert ( - observations_visit["mjd_utc"].unique()[0] - == ephemeris_visit["mjd_utc"].unique()[0] - ) + for _, ephemeris_i, observations_i in linkage.iterate(): + + # Extract the observation IDs and times + obs_ids = observations_i.detections.id.to_numpy(zero_copy_only=False) + obs_times = observations_i.detections.time.mjd().to_numpy(zero_copy_only=False) + orbit_ids = ephemeris_i.orbit_id.to_numpy(zero_copy_only=False) - obs_ids = observations_visit[["obs_id"]].values - obs_times = observations_visit[["mjd_utc"]].values - orbit_ids = ephemeris_visit[["orbit_id"]].values - coords = observations_visit[["RA_deg", "Dec_deg"]].values - coords_latlon = observations_visit[["Dec_deg"]].values - coords_predicted = ephemeris_visit[["RA_deg", "Dec_deg"]].values - coords_sigma = observations_visit[["RA_sigma_deg", "Dec_sigma_deg"]].values + # Extract the spherical coordinates for both the observations + # and ephemeris + coords = observations_i.to_spherical_coordinates() + coords_predicted = ephemeris_i.coordinates # Haversine metric requires latitude first then longitude... 
- coords_latlon = np.radians(observations_visit[["Dec_deg", "RA_deg"]].values) - coords_predicted_latlon = np.radians( - ephemeris_visit[["Dec_deg", "RA_deg"]].values - ) + coords_latlon = np.radians(coords.values[:, LATLOT_INDEX]) + coords_predicted_latlon = np.radians(coords_predicted.values[:, LATLOT_INDEX]) num_obs = len(coords_predicted) k = np.minimum(3, num_obs) @@ -139,210 +194,112 @@ def attribution_worker( # Select all observations with distance smaller or equal # to the maximum given distance - mask = np.where(d <= eps_rad) + mask = np.where(d <= radius_rad) if len(d[mask]) > 0: orbit_ids_associated.append(orbit_ids[i[mask]]) obs_ids_associated.append(obs_ids[mask[0]]) obs_times_associated.append(obs_times[mask[0]]) - distances.append(d[mask].reshape(-1, 1)) + distances.append(d[mask]) - residuals_visit, stats_visit = calcResiduals( - coords[mask[0]], - coords_predicted[i[mask]], - sigmas_actual=coords_sigma[mask[0]], - include_probabilistic=True, + residuals_i = Residuals.calculate( + coords.take(mask[0]), coords_predicted.take(i[mask]) ) - residuals.append(residuals_visit) - stats.append(np.vstack(stats_visit).T) + residuals.append(residuals_i) if len(distances) > 0: - distances = np.degrees(np.vstack(distances)) - orbit_ids_associated = np.vstack(orbit_ids_associated) - obs_ids_associated = np.vstack(obs_ids_associated) - obs_times_associated = np.vstack(obs_times_associated) - residuals = np.vstack(residuals) - stats = np.vstack(stats) - - attributions = { - "orbit_id": orbit_ids_associated[:, 0], - "obs_id": obs_ids_associated[:, 0], - "mjd_utc": obs_times_associated[:, 0], - "distance": distances[:, 0], - "residual_ra_arcsec": residuals[:, 0] * 3600, - "residual_dec_arcsec": residuals[:, 1] * 3600, - "chi2": stats[:, 0], - } - if include_probabilistic: - attributions["probability"] = stats[:, 1] - attributions["mahalanobis_distance"] = stats[:, 2] - - attributions = pd.DataFrame(attributions) + distances = np.degrees(np.concatenate(distances)) + orbit_ids_associated = np.concatenate(orbit_ids_associated) + obs_ids_associated = np.concatenate(obs_ids_associated) + obs_times_associated = np.concatenate(obs_times_associated) + residuals = qv.concatenate(residuals) + + return Attributions.from_kwargs( + orbit_id=orbit_ids_associated, + obs_id=obs_ids_associated, + residuals=residuals, + distance=distances, + ) else: - columns = [ - "orbit_id", - "obs_id", - "mjd_utc", - "distance", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - ] - if include_probabilistic: - columns += ["probability", "mahalanobis_distance"] - - attributions = pd.DataFrame(columns=columns) + return Attributions.empty() - return attributions + +attribution_worker_remote = ray.remote(attribution_worker) +attribution_worker_remote.options( + num_returns=1, + num_cpus=1, +) -def attributeObservations( - orbits, - observations, - eps=5 / 3600, - include_probabilistic=True, +def attribute_observations( + orbits: Orbits, + observations: Observations, + radius: float = 5 / 3600, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, - orbits_chunk_size=10, - observations_chunk_size=100000, - num_jobs=1, - parallel_backend="cf", -): + orbits_chunk_size: int = 10, + observations_chunk_size: int = 100000, + max_processes: Optional[int] = 1, +) -> Attributions: logger.info("Running observation attribution...") time_start = time.time() - num_orbits = len(orbits) - - attribution_dfs = [] - - parallel, num_workers = _checkParallel(num_jobs, parallel_backend) - if num_workers > 1: - - if 
parallel_backend == "ray": - import ray - - if not ray.is_initialized(): - ray.init(address="auto") - - attribution_worker_ray = ray.remote(attribution_worker) - attribution_worker_ray = attribution_worker_ray.options( - num_returns=1, num_cpus=1 - ) - - # Send up to orbits_chunk_size orbits to each OD worker for processing - chunk_size_ = calcChunkSize( - num_orbits, num_workers, orbits_chunk_size, min_chunk_size=1 - ) - orbits_split = [chunk for chunk in _iterate_chunks(orbits, chunk_size_)] - - obs_oids = [] - for observations_c in yieldChunks(observations, observations_chunk_size): - obs_oids.append(ray.put(observations_c)) - - p = [] - for obs_oid in obs_oids: - for orbit_i in orbits_split: - p.append( - attribution_worker_ray.remote( - orbit_i, - obs_oid, - eps=eps, - include_probabilistic=include_probabilistic, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - ) - - attribution_dfs_i = ray.get(p) - attribution_dfs += attribution_dfs_i - - elif parallel_backend == "mp": - p = mp.Pool( - processes=num_workers, - initializer=_initWorker, - ) - - # Send up to orbits_chunk_size orbits to each OD worker for processing - chunk_size_ = calcChunkSize( - num_orbits, num_workers, orbits_chunk_size, min_chunk_size=1 - ) - orbits_split = [chunk for chunk in _iterate_chunks(orbits, chunk_size_)] - - for observations_c in yieldChunks(observations, observations_chunk_size): - - obs = [observations_c for i in range(len(orbits_split))] - attribution_dfs_i = p.starmap( - partial( - attribution_worker, - eps=eps, - include_probabilistic=include_probabilistic, + orbit_ids = orbits.orbit_id + observation_indices = np.arange(0, len(observations)) + + attributions_list = [] + if max_processes is None or max_processes > 1: + + if not ray.is_initialized(): + ray.init(address="auto") + + observations_ref = ray.put(observations) + orbits_ref = ray.put(orbits) + + futures = [] + for orbit_id_chunk in _iterate_chunks(orbit_ids, orbits_chunk_size): + for observations_indices_chunk in _iterate_chunks( + observation_indices, observations_chunk_size + ): + futures.append( + attribution_worker_remote.remote( + orbit_id_chunk, + observations_indices_chunk, + orbits_ref, + observations_ref, + radius=radius, propagator=propagator, propagator_kwargs=propagator_kwargs, - ), - zip( - orbits_split, - obs, - ), - ) - attribution_dfs += attribution_dfs_i - - p.close() - - elif parallel_backend == "cf": - with cf.ProcessPoolExecutor( - max_workers=num_workers, initializer=_initWorker - ) as executor: - futures = [] - for observations_c in yieldChunks( - observations, observations_chunk_size - ): - for orbit_c in _iterate_chunks(orbits, orbits_chunk_size): - futures.append( - executor.submit( - attribution_worker, - orbit_c, - observations_c, - eps=eps, - include_probabilistic=include_probabilistic, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - ) - attribution_dfs = [] - for future in cf.as_completed(futures): - attribution_dfs.append(future.result()) - - else: - raise ValueError( - "Invalid parallel_backend '{}'. 
Must be one of 'ray', 'mp', or 'cf'.".format( - parallel_backend + ) ) - ) + + while futures: + finished, futures = ray.wait(futures, num_returns=1) + attributions_list.append(ray.get(finished[0])) else: - for observations_c in yieldChunks(observations, observations_chunk_size): - for orbit_c in _iterate_chunks(orbits, orbits_chunk_size): + for orbit_id_chunk in _iterate_chunks(orbit_ids, orbits_chunk_size): + for observations_indices_chunk in _iterate_chunks( + observation_indices, observations_chunk_size + ): attribution_df_i = attribution_worker( - orbit_c, - observations_c, - eps=eps, - include_probabilistic=include_probabilistic, + orbit_id_chunk, + observations_indices_chunk, + orbits, + observations, + radius=radius, propagator=propagator, propagator_kwargs=propagator_kwargs, ) - attribution_dfs.append(attribution_df_i) + attributions_list.append(attribution_df_i) - attributions = pd.concat(attribution_dfs) - attributions.sort_values( - by=["orbit_id", "mjd_utc", "distance"], inplace=True, ignore_index=True - ) + attributions = qv.concatenate(attributions_list) + attributions = attributions.sort_by(["orbit_id", "obs_id", "distance"]) time_end = time.time() logger.info( - "Attributed {} observations to {} orbits.".format( - attributions["obs_id"].nunique(), attributions["orbit_id"].nunique() - ) + f"Attributed {len(attributions.obs_id.unique())} observations to {len(attributions.orbit_id.unique())} orbits." ) logger.info( "Attribution completed in {:.3f} seconds.".format(time_end - time_start) @@ -350,26 +307,25 @@ def attributeObservations( return attributions -def mergeAndExtendOrbits( - orbits, - orbit_members, - observations, - min_obs=6, - min_arc_length=1.0, - contamination_percentage=20.0, - rchi2_threshold=5, - eps=1 / 3600, - delta=1e-8, - max_iter=20, - method="central", - fit_epoch=False, +def merge_and_extend_orbits( + orbits: FittedOrbits, + orbit_members: FittedOrbitMembers, + observations: Observations, + min_obs: int = 6, + min_arc_length: float = 1.0, + contamination_percentage: float = 20.0, + rchi2_threshold: float = 5, + radius: float = 1 / 3600, + delta: float = 1e-8, + max_iter: int = 20, + method: Literal["central", "finite"] = "central", + fit_epoch: bool = False, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, - orbits_chunk_size=10, - observations_chunk_size=100000, - num_jobs=60, - parallel_backend="cf", -): + orbits_chunk_size: int = 10, + observations_chunk_size: int = 100000, + max_processes: Optional[int] = 1, +) -> Tuple[FittedOrbits, FittedOrbitMembers]: """ Attempt to extend an orbit's observational arc by running attribution on the observations. 
This is an iterative process: attribution @@ -392,76 +348,47 @@ def mergeAndExtendOrbits( time_start = time.time() logger.info("Running orbit extension and merging...") - if len(observations) > 0: - orbits_iter, orbit_members_iter = sortLinkages( - orbits, orbit_members, observations, linkage_id_col="orbit_id" - ) - - else: - orbits_iter = orbits.copy() - orbit_members_iter = orbit_members.copy() + # Set the running variables + orbits_iter = orbits + orbit_members_iter = orbit_members + observations_iter = observations iterations = 0 - odp_orbits_dfs = [] - odp_orbit_members_dfs = [] - observations_iter = observations.copy() + odp_orbits_list = [] + odp_orbit_members_list = [] if len(orbits_iter) > 0 and len(observations_iter) > 0: - converged = False + converged = False while not converged: # Run attribution - attributions = attributeObservations( - Orbits.from_flat_dataframe(orbits_iter), + attributions = attribute_observations( + orbits_iter.to_orbits(), observations_iter, - eps=eps, - include_probabilistic=True, + radius=radius, propagator=propagator, propagator_kwargs=propagator_kwargs, orbits_chunk_size=orbits_chunk_size, observations_chunk_size=observations_chunk_size, - num_jobs=num_jobs, - parallel_backend=parallel_backend, + max_processes=max_processes, ) - assert np.all( - np.isin( - orbit_members_iter["obs_id"].unique(), - observations_iter["obs_id"].unique(), - ) - ) + # For orbits with coincident observations: multiple observations attributed at + # the same time, keep only observation with smallest distance + attributions = attributions.drop_coincident_attributions(observations) - # Attributions are sorted by orbit ID, observation time and - # angular distance. Keep only the one observation with smallest distance - # for any orbits that have multiple observations attributed at the same observation time. 
- attributions.drop_duplicates( - subset=["orbit_id", "mjd_utc"], - keep="first", - inplace=True, - ignore_index=True, + # Create a new orbit members table with the newly attributed observations and + # filter the orbits to only include those that still have observations + orbit_members_iter = FittedOrbitMembers.from_kwargs( + orbit_id=attributions.orbit_id, + obs_id=attributions.obs_id, + residuals=attributions.residuals, ) - orbit_members_iter = attributions[ - [ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - ] - ] - orbits_iter = orbits_iter[ - orbits_iter["orbit_id"].isin(orbit_members_iter["orbit_id"].unique()) - ] - - orbits_iter, orbit_members_iter = sortLinkages( - orbits_iter, - orbit_members_iter[["orbit_id", "obs_id"]], - observations_iter, + orbits_iter = orbits_iter.apply_mask( + pc.is_in(orbits_iter.orbit_id, orbit_members_iter.orbit_id.unique()) ) - # Run differential orbit correction on all orbits - # with the newly added observations to the orbits - # that had observations attributed to them - orbits_iter, orbit_members_iter = differentialCorrection( + # Run differential orbit correction + orbits_iter, orbit_members_iter = differential_correction( orbits_iter, orbit_members_iter, observations_iter, @@ -472,118 +399,123 @@ def mergeAndExtendOrbits( delta=delta, method=method, max_iter=max_iter, - fit_epoch=False, + fit_epoch=fit_epoch, propagator=propagator, propagator_kwargs=propagator_kwargs, chunk_size=orbits_chunk_size, - num_jobs=num_jobs, - parallel_backend=parallel_backend, + max_processes=max_processes, + ) + orbit_members_iter = orbit_members_iter.drop_outliers() + + # Remove any duplicate orbits + orbits_iter = orbits_iter.sort_by([("reduced_chi2", "ascending")]) + orbits_iter, orbit_members_iter = orbits_iter.drop_duplicates( + orbit_members_iter, + subset=[ + "coordinates.time.days", + "coordinates.time.nanos", + "coordinates.x", + "coordinates.y", + "coordinates.z", + "coordinates.vx", + "coordinates.vy", + "coordinates.vz", + ], + keep="first", ) - orbit_members_iter = orbit_members_iter[orbit_members_iter["outlier"] == 0] - orbit_members_iter.reset_index(inplace=True, drop=True) # Remove the orbits that were not improved from the pool of available orbits. Orbits that were not improved # are orbits that have already iterated to their best-fit solution given the observations available. These orbits # are unlikely to recover more observations in subsequent iterations and so can be saved for output. - not_improved = orbits_iter[orbits_iter["improved"] == False][ - "orbit_id" - ].values - orbits_out = orbits_iter[orbits_iter["orbit_id"].isin(not_improved)].copy() - orbit_members_out = orbit_members_iter[ - orbit_members_iter["orbit_id"].isin(not_improved) - ].copy() - not_improved_obs_ids = orbit_members_out["obs_id"].values - - # If some orbits that were not improved still share observations, keep the orbit with the lowest - # reduced chi2 in the pool of orbits but delete the others. 
- obs_id_occurences = orbit_members_out["obs_id"].value_counts() - duplicate_obs_ids = obs_id_occurences.index.values[ - obs_id_occurences.values > 1 - ] + not_improved_mask = pc.equal(orbits_iter.improved, False) + orbits_out = orbits_iter.apply_mask(not_improved_mask) + orbit_members_out = orbit_members_iter.apply_mask( + pc.is_in(orbit_members_iter.orbit_id, orbits_out.orbit_id) + ) - logger.info( - "There are {} observations that appear in more than one orbit.".format( - len(duplicate_obs_ids) - ) + # If some of the orbits that haven't improved in their orbit fit still share observations + # then assign those observations to the orbit with the most observations, longest arc length, + # and lowest reduced chi2 (and remove the other instances of that observation) + # We will one final iteration of OD on the output orbits later + orbits_out, orbit_members_out = orbits_out.assign_duplicate_observations( + orbit_members_out ) - orbits_out, orbit_members_out = removeDuplicateObservations( - orbits_out, - orbit_members_out, - min_obs=min_obs, - linkage_id_col="orbit_id", - filter_cols=["num_obs", "arc_length", "r_sigma", "v_sigma"], - ascending=[False, False, True, True], + + # Add these orbits to the output list + odp_orbits_list.append(orbits_out) + odp_orbit_members_list.append(orbit_members_out) + + # Remove observations that have been added to the output list of orbits + observations_iter = observations_iter.apply_mask( + pc.invert( + pc.is_in( + observations_iter.detections.id, + orbit_members_out.obs_id.unique(), + ) + ) ) - observations_iter = observations_iter[ - ~observations_iter["obs_id"].isin(orbit_members_out["obs_id"].values) - ] - orbit_members_iter = orbit_members_iter[ - ~orbit_members_iter["orbit_id"].isin(orbits_out["orbit_id"].values) - ] - orbit_members_iter = orbit_members_iter[ - orbit_members_iter["obs_id"].isin(observations_iter["obs_id"].values) - ] - orbits_iter = orbits_iter[ - orbits_iter["orbit_id"].isin(orbit_members_iter["orbit_id"].unique()) - ] - orbit_members_iter = orbit_members_iter[["orbit_id", "obs_id"]] + # Identify the orbit that could still be improved more + improved_mask = pc.invert(not_improved_mask) + orbits_iter = orbits_iter.apply_mask(improved_mask) + orbit_members_iter = orbit_members_iter.apply_mask( + pc.is_in(orbit_members_iter.orbit_id, orbits_iter.orbit_id) + ) - odp_orbits_dfs.append(orbits_out) - odp_orbit_members_dfs.append(orbit_members_out) + # Remove observations that have been added to the output list of orbits + # from the orbits that we will continue iterating over + orbit_members_iter = orbit_members_iter.apply_mask( + pc.invert( + pc.is_in( + orbit_members_iter.obs_id, + orbit_members_out.obs_id.unique(), + ) + ) + ) + orbits_iter = orbits_iter.apply_mask( + pc.is_in(orbits_iter.orbit_id, orbit_members_iter.orbit_id.unique()) + ) iterations += 1 if len(orbits_iter) == 0: converged = True - odp_orbits = pd.concat(odp_orbits_dfs) - odp_orbit_members = pd.concat(odp_orbit_members_dfs) - - odp_orbits.drop(columns=["improved"], inplace=True) - odp_orbits, odp_orbit_members = sortLinkages( - odp_orbits, odp_orbit_members, observations, linkage_id_col="orbit_id" - ) + odp_orbits = qv.concatenate(odp_orbits_list) + odp_orbit_members = qv.concatenate(odp_orbit_members_list) + + if len(odp_orbits) > 0: + # Do one final iteration of OD on the output orbits. 
This + # will update any fits of orbits that might have had observations + # removed during the assign_duplicate_observations step + odp_orbits, odp_orbit_members = differential_correction( + odp_orbits, + odp_orbit_members, + observations, + rchi2_threshold=rchi2_threshold, + min_obs=min_obs, + min_arc_length=min_arc_length, + contamination_percentage=contamination_percentage, + delta=delta, + method=method, + max_iter=max_iter, + fit_epoch=fit_epoch, + propagator=propagator, + propagator_kwargs=propagator_kwargs, + chunk_size=orbits_chunk_size, + max_processes=max_processes, + ) + odp_orbit_members = odp_orbit_members.drop_outliers() else: - odp_orbits = pd.DataFrame( - columns=[ - "orbit_id", - "mjd_tdb", - "x", - "y", - "z", - "vx", - "vy", - "vz", - "covariance", - "arc_length", - "num_obs", - "chi2", - "rchi2", - ] - ) - - odp_orbit_members = pd.DataFrame( - columns=[ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - "outlier", - ] - ) + odp_orbits = FittedOrbits.empty() + odp_orbit_members = FittedOrbitMembers.empty() time_end = time.time() logger.info( - "Number of attribution / differential correction iterations: {}".format( - iterations - ) + f"Number of attribution / differential correction iterations: {iterations}" ) logger.info( - "Extended and/or merged {} orbits into {} orbits.".format( - len(orbits), len(odp_orbits) - ) + f"Extended and/or merged {len(orbits)} orbits into {len(odp_orbits)} orbits." ) logger.info( "Orbit extension and merging completed in {:.3f} seconds.".format( diff --git a/thor/orbits/gauss.py b/thor/orbits/gauss.py index b639bd2d..9932e4b3 100644 --- a/thor/orbits/gauss.py +++ b/thor/orbits/gauss.py @@ -1,4 +1,5 @@ import numpy as np +from adam_core.constants import Constants as c from adam_core.coordinates import ( CartesianCoordinates, Origin, @@ -7,11 +8,9 @@ ) from adam_core.orbits import Orbits from adam_core.time import Timestamp -from astropy.time import Time from numba import jit from numpy import roots -from ..constants import Constants as c from .gibbs import calcGibbs from .herrick_gibbs import calcHerrickGibbs from .iterators import iterateStateTransition diff --git a/thor/orbits/gibbs.py b/thor/orbits/gibbs.py index 0b815f4d..5b7be573 100644 --- a/thor/orbits/gibbs.py +++ b/thor/orbits/gibbs.py @@ -1,6 +1,5 @@ import numpy as np - -from ..constants import Constants as c +from adam_core.constants import Constants as c __all__ = ["calcGibbs"] diff --git a/thor/orbits/herrick_gibbs.py b/thor/orbits/herrick_gibbs.py index 54f8432e..fff3967c 100644 --- a/thor/orbits/herrick_gibbs.py +++ b/thor/orbits/herrick_gibbs.py @@ -1,6 +1,5 @@ import numpy as np - -from ..constants import Constants as c +from adam_core.constants import Constants as c __all__ = ["calcHerrickGibbs"] diff --git a/thor/orbits/iod.py b/thor/orbits/iod.py index 67fbe54d..39c8afa3 100644 --- a/thor/orbits/iod.py +++ b/thor/orbits/iod.py @@ -1,45 +1,32 @@ -import os -from typing import Literal - -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["VECLIB_MAXIMUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" - -import concurrent.futures as cf import logging -import multiprocessing as mp import time -from functools import partial from itertools import combinations +from typing import Literal, Optional, Tuple, Type, Union import numpy as np -import pandas as pd +import numpy.typing as npt +import pyarrow as pa +import pyarrow.compute as pc import quivr as qv 
-from adam_core.observers import Observers -from adam_core.propagator import PYOORB -from adam_core.time import Timestamp -from astropy.time import Time - -from ..utils import ( - _checkParallel, - _initWorker, - calcChunkSize, - identifySubsetLinkages, - sortLinkages, - yieldChunks, -) +import ray +from adam_core.coordinates.residuals import Residuals +from adam_core.propagator import PYOORB, Propagator +from adam_core.propagator.utils import _iterate_chunks + +from ..clusters import ClusterMembers +from ..observations.observations import Observations +from ..orbit_determination.fitted_orbits import FittedOrbitMembers, FittedOrbits from .gauss import gaussIOD -from .residuals import calcResiduals -from .utils import _ephemeris_to_dataframe logger = logging.getLogger(__name__) -__all__ = ["selectObservations", "iod", "iod_worker", "initialOrbitDetermination"] +__all__ = ["initial_orbit_determination"] -def selectObservations(observations, method="combinations"): +def select_observations( + observations: Observations, + method: Literal["combinations", "first+middle+last", "thirds"] = "combinations", +) -> npt.NDArray[np.str_]: """ Selects which three observations to use for IOD depending on the method. @@ -64,12 +51,12 @@ def selectObservations(observations, method="combinations"): An array of selected observation IDs. If three unique observations could not be selected then returns an empty array. """ - obs_ids = observations["obs_id"].values + obs_ids = observations.detections.id.to_numpy(zero_copy_only=False) if len(obs_ids) < 3: return np.array([]) indexes = np.arange(0, len(obs_ids)) - times = observations["mjd_utc"].values + times = observations.detections.time.mjd().to_numpy(zero_copy_only=False) if method == "first+middle+last": selected_times = np.percentile(times, [0, 50, 100], interpolation="nearest") @@ -123,68 +110,86 @@ def selectObservations(observations, method="combinations"): def iod_worker( - observations_list, - observation_selection_method="combinations", - min_obs=6, - min_arc_length=1.0, - rchi2_threshold=10**3, - contamination_percentage=0.0, - iterate=False, - light_time=True, - linkage_id_col="cluster_id", - propagator: Literal["PYOORB"] = "PYOORB", + linkage_ids: npt.NDArray[np.str_], + observations: Union[Observations, ray.ObjectRef], + linkage_members: Union[ClusterMembers, FittedOrbitMembers, ray.ObjectRef], + min_obs: int = 6, + min_arc_length: float = 1.0, + contamination_percentage: float = 0.0, + rchi2_threshold: float = 200, + observation_selection_method: Literal[ + "combinations", "first+middle+last", "thirds" + ] = "combinations", + linkage_id_col: str = "cluster_id", + iterate: bool = False, + light_time: bool = True, + propagator: Type[Propagator] = PYOORB, propagator_kwargs: dict = {}, -): - iod_orbits_dfs = [] - iod_orbit_members_dfs = [] - for observations in observations_list: - assert np.all( - sorted(observations["mjd_utc"].values) == observations["mjd_utc"].values - ) +) -> Tuple[FittedOrbits, FittedOrbitMembers]: + + prop = propagator(**propagator_kwargs) + + iod_orbits_list = [] + iod_orbit_members_list = [] + for linkage_id in linkage_ids: time_start = time.time() - linkage_id = observations[linkage_id_col].unique()[0] logger.debug(f"Finding initial orbit for linkage {linkage_id}...") + obs_ids = linkage_members.apply_mask( + pc.equal(linkage_members.column(linkage_id_col), linkage_id) + ).obs_id + observations_linkage = observations.apply_mask( + pc.is_in(observations.detections.id, obs_ids) + ) + iod_orbit, iod_orbit_members = iod( - 
observations, - observation_selection_method=observation_selection_method, + observations_linkage, min_obs=min_obs, min_arc_length=min_arc_length, rchi2_threshold=rchi2_threshold, contamination_percentage=contamination_percentage, + observation_selection_method=observation_selection_method, iterate=iterate, light_time=light_time, - propagator=propagator, - propagator_kwargs=propagator_kwargs, + propagator=prop, ) if len(iod_orbit) > 0: - iod_orbit.insert(1, linkage_id_col, linkage_id) + iod_orbit = iod_orbit.set_column("orbit_id", pa.array([linkage_id])) + iod_orbit_members = iod_orbit_members.set_column( + "orbit_id", + pa.array([linkage_id for i in range(len(iod_orbit_members))]), + ) time_end = time.time() duration = time_end - time_start logger.debug(f"IOD for linkage {linkage_id} completed in {duration:.3f}s.") - iod_orbits_dfs.append(iod_orbit) - iod_orbit_members_dfs.append(iod_orbit_members) + iod_orbits_list.append(iod_orbit) + iod_orbit_members_list.append(iod_orbit_members) - iod_orbits = pd.concat(iod_orbits_dfs, ignore_index=True) - iod_orbit_members = pd.concat(iod_orbit_members_dfs, ignore_index=True) + iod_orbits = qv.concatenate(iod_orbits_list) + iod_orbit_members = qv.concatenate(iod_orbit_members_list) return iod_orbits, iod_orbit_members +iod_worker_remote = ray.remote(iod_worker) +iod_worker_remote.options(num_returns=1, num_cpus=1) + + def iod( - observations, - min_obs=6, - min_arc_length=1.0, - contamination_percentage=0.0, - rchi2_threshold=200, - observation_selection_method="combinations", - iterate=False, - light_time=True, - propagator: Literal["PYOORB"] = "PYOORB", - propagator_kwargs: dict = {}, -): + observations: Observations, + min_obs: int = 6, + min_arc_length: float = 1.0, + contamination_percentage: float = 0.0, + rchi2_threshold: float = 200, + observation_selection_method: Literal[ + "combinations", "first+middle+last", "thirds" + ] = "combinations", + iterate: bool = False, + light_time: bool = True, + propagator: Propagator = PYOORB(), +) -> Tuple[FittedOrbits, FittedOrbitMembers]: """ Run initial orbit determination on a set of observations believed to belong to a single object. 
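A minimal calling sketch for the refactored iod() signature shown above, mirroring what iod_worker does per linkage. It is only illustrative: `observations_linkage` is a placeholder for an Observations table containing the detections of a single linkage, and the direct module import is an assumption (iod is not re-exported in __all__).

from adam_core.propagator import PYOORB

from thor.orbits.iod import iod  # direct module import; only initial_orbit_determination is exported

# Fit an initial orbit to one linkage's observations
iod_orbit, iod_orbit_members = iod(
    observations_linkage,
    min_obs=6,
    min_arc_length=1.0,
    contamination_percentage=0.0,
    rchi2_threshold=200,
    observation_selection_method="combinations",
    iterate=False,
    light_time=True,
    propagator=PYOORB(),
)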
@@ -265,49 +270,12 @@ def iod( if len(observations) == 0: processable = False - # Extract column names - obs_id_col = "obs_id" - time_col = "mjd_utc" - ra_col = "RA_deg" - dec_col = "Dec_deg" - ra_err_col = "RA_sigma_deg" - dec_err_col = "Dec_sigma_deg" - obs_code_col = "observatory_code" - obs_x_col = "obs_x" - obs_y_col = "obs_y" - obs_z_col = "obs_z" - - # Extract observation IDs, sky-plane positions, sky-plane position uncertainties, times of observation, - # and the location of the observer at each time - obs_ids_all = observations[obs_id_col].values - coords_all = observations[[ra_col, dec_col]].values - sigmas_all = observations[[ra_err_col, dec_err_col]].values - coords_obs_all = observations[[obs_x_col, obs_y_col, obs_z_col]].values - times_all = observations[time_col].values - times_all = Time(times_all, scale="utc", format="mjd") - - observers_list = [] - for observatory_code in observations["observatory_code"].unique(): - observers_list.append( - Observers.from_code( - observatory_code, - Timestamp.from_mjd( - observations[ - observations["observatory_code"].isin([observatory_code]) - ]["mjd_utc"].unique(), - scale="utc", - ), - ) - ) - observers = qv.concatenate(observers_list) - observers = observers.sort_by( - ["coordinates.time.days", "coordinates.time.nanos", "code"] - ) - - if propagator == "PYOORB": - prop = PYOORB(**propagator_kwargs) - else: - raise ValueError(f"Invalid propagator '{propagator}'.") + obs_ids_all = observations.detections.id.to_numpy(zero_copy_only=False) + coords_all = observations.to_spherical_coordinates() + observers_with_states = observations.get_observers() + observers = observers_with_states.observers + coords_obs_all = observers_with_states.observers.coordinates.r + times_all = coords_all.time.mjd().to_numpy(zero_copy_only=False) chi2_sol = 1e10 orbit_sol = None @@ -322,7 +290,7 @@ def iod( num_outliers = np.maximum(np.minimum(num_obs - min_obs, num_outliers), 0) # Select observation IDs to use for IOD - obs_ids = selectObservations( + obs_ids = select_observations( observations, method=observation_selection_method, ) @@ -341,14 +309,14 @@ def iod( # Grab sky-plane positions of the selected observations, the heliocentric ecliptic position of the observer, # and the times at which the observations occur - coords = coords_all[mask, :] + coords = coords_all.apply_mask(mask) coords_obs = coords_obs_all[mask, :] times = times_all[mask] # Run IOD iod_orbits = gaussIOD( - coords, - times.utc.mjd, + coords.values[:, 1:3], + times, coords_obs, light_time=light_time, iterate=iterate, @@ -360,23 +328,22 @@ def iod( continue # Propagate initial orbit to all observation times - ephemeris = prop._generate_ephemeris(iod_orbits, observers) - ephemeris = _ephemeris_to_dataframe(ephemeris) + ephemeris = propagator.generate_ephemeris( + iod_orbits, observers, chunk_size=1, max_processes=1 + ) # For each unique initial orbit calculate residuals and chi-squared # Find the orbit which yields the lowest chi-squared orbit_ids = iod_orbits.orbit_id.to_numpy(zero_copy_only=False) for i, orbit_id in enumerate(orbit_ids): - ephemeris_orbit = ephemeris[ephemeris["orbit_id"] == orbit_id] + ephemeris_orbit = ephemeris.select("orbit_id", orbit_id) # Calculate residuals and chi2 - residuals, stats = calcResiduals( + residuals = Residuals.calculate( coords_all, - ephemeris_orbit[["RA_deg", "Dec_deg"]].values, - sigmas_actual=sigmas_all, - include_probabilistic=False, + ephemeris_orbit.coordinates, ) - chi2 = stats[0] + chi2 = residuals.chi2.to_numpy() chi2_total = np.sum(chi2) 
rchi2 = chi2_total / (2 * num_obs - 6) @@ -403,7 +370,7 @@ def iod( rchi2_sol = rchi2 residuals_sol = residuals outliers = np.array([]) - arc_length = times_all.utc.mjd.max() - times_all.utc.mjd.min() + arc_length = times_all.max() - times_all.min() converged = True break @@ -431,10 +398,7 @@ def iod( rchi2_new = chi2_new / (2 * num_obs_new - 6) ids_mask = np.isin(obs_ids_all, obs_id_outlier, invert=True) - arc_length = ( - times_all[ids_mask].utc.mjd.max() - - times_all[ids_mask].utc.mjd.min() - ) + arc_length = times_all[ids_mask].max() - times_all[ids_mask].min() # If the updated reduced chi2 total is lower than our desired # threshold, accept the soluton. If not, keep going. @@ -448,8 +412,7 @@ def iod( num_obs = num_obs_new ids_mask = np.isin(obs_ids_all, outliers, invert=True) arc_length = ( - times_all[ids_mask].utc.mjd.max() - - times_all[ids_mask].utc.mjd.min() + times_all[ids_mask].max() - times_all[ids_mask].min() ) chi2_sol = chi2 converged = True @@ -462,79 +425,49 @@ def iod( if not converged or not processable: - orbit = pd.DataFrame( - columns=[ - "orbit_id", - "mjd_tdb", - "x", - "y", - "z", - "vx", - "vy", - "vz", - "arc_length", - "num_obs", - "chi2", - "rchi2", - ] - ) + return FittedOrbits.empty(), FittedOrbitMembers.empty() - orbit_members = pd.DataFrame( - columns=[ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - "gauss_sol", - "outlier", - ] + else: + + orbit = FittedOrbits.from_kwargs( + orbit_id=orbit_sol.orbit_id, + object_id=orbit_sol.object_id, + coordinates=orbit_sol.coordinates, + arc_length=[arc_length], + num_obs=[num_obs], + chi2=[chi2_total_sol], + reduced_chi2=[rchi2_sol], ) - else: - orbit = orbit_sol.to_dataframe() - orbit["arc_length"] = arc_length - orbit["num_obs"] = num_obs - orbit["chi2"] = chi2_total_sol - orbit["rchi2"] = rchi2_sol - - orbit_members = pd.DataFrame( - { - "orbit_id": [ - orbit_sol.orbit_id[0].as_py() for i in range(len(obs_ids_all)) - ], - "obs_id": obs_ids_all, - "residual_ra_arcsec": residuals_sol[:, 0] * 3600, - "residual_dec_arcsec": residuals_sol[:, 1] * 3600, - "chi2": chi2_sol, - "gauss_sol": np.zeros(len(obs_ids_all), dtype=int), - "outlier": np.zeros(len(obs_ids_all), dtype=int), - } + orbit_members = FittedOrbitMembers.from_kwargs( + orbit_id=np.full(len(obs_ids_all), orbit_sol.orbit_id[0].as_py()), + obs_id=obs_ids_all, + residuals=residuals_sol, + solution=np.isin(obs_ids_all, obs_ids_sol), + outlier=np.isin(obs_ids_all, outliers), ) - orbit_members.loc[orbit_members["obs_id"].isin(outliers), "outlier"] = 1 - orbit_members.loc[orbit_members["obs_id"].isin(obs_ids_sol), "gauss_sol"] = 1 return orbit, orbit_members -def initialOrbitDetermination( - observations, - linkage_members, - min_obs=6, - min_arc_length=1.0, - contamination_percentage=20.0, - rchi2_threshold=10**3, - observation_selection_method="combinations", - iterate=False, - light_time=True, - linkage_id_col="cluster_id", - identify_subsets=True, - propagator: Literal["PYOORB"] = "PYOORB", +def initial_orbit_determination( + observations: Union[Observations, ray.ObjectRef], + linkage_members: Union[ClusterMembers, FittedOrbitMembers, ray.ObjectRef], + min_obs: int = 6, + min_arc_length: float = 1.0, + contamination_percentage: float = 20.0, + rchi2_threshold: float = 10**3, + observation_selection_method: Literal[ + "combinations", "first+middle+last", "thirds" + ] = "combinations", + iterate: bool = False, + light_time: bool = True, + linkage_id_col: str = "cluster_id", + propagator: Type[Propagator] = PYOORB, 
propagator_kwargs: dict = {}, - chunk_size=1, - num_jobs=1, - parallel_backend="cf", -): + chunk_size: int = 1, + max_processes: Optional[int] = 1, +) -> Tuple[FittedOrbits, FittedOrbitMembers]: """ Run initial orbit determination on linkages found in observations. @@ -624,226 +557,90 @@ def initialOrbitDetermination( time_start = time.time() logger.info("Running initial orbit determination...") + iod_orbits_list = [] + iod_orbit_members_list = [] if len(observations) > 0 and len(linkage_members) > 0: - iod_orbits_dfs = [] - iod_orbit_members_dfs = [] - - start = time.time() - logger.debug("Merging observations on linkage members...") - linked_observations = linkage_members.merge(observations, on="obs_id") - logger.debug("Sorting observations by linkage ID and mjd_utc...") - linked_observations.sort_values( - by=[linkage_id_col, "mjd_utc"], inplace=True, ignore_index=True - ) - duration = time.time() - start - logger.debug(f"Merging and sorting completed in {duration:.3f}s.") - - start = time.time() - logger.debug("Grouping observations by linkage ID...") - grouped_observations = linked_observations.groupby(by=[linkage_id_col]) - logger.debug("Splitting grouped observations by linkage ID...") - observations_split = [ - grouped_observations.get_group(g).reset_index(drop=True) - for g in grouped_observations.groups - ] - duration = time.time() - start - logger.debug(f"Grouping and splitting completed in {duration:.3f}s.") - - parallel, num_workers = _checkParallel(num_jobs, parallel_backend) - if parallel: + # Extract linkage IDs + linkage_ids = linkage_members.column(linkage_id_col).unique() - # The number of linkages that need to be fit for an initial orbit - num_linkages = linkage_members[linkage_id_col].nunique() + if max_processes is None or max_processes > 1: - if parallel_backend == "ray": + if not ray.is_initialized(): + ray.init(address="auto") - import ray + observations_ref = ray.put(observations) + linkage_members_ref = ray.put(linkage_members) - if not ray.is_initialized(): - ray.init(address="auto") - - iod_worker_ray = ray.remote(iod_worker) - iod_worker_ray = iod_worker_ray.options(num_returns=2, num_cpus=1) - - # Send up to chunk_size linkages to each IOD worker for processing - chunk_size_ = calcChunkSize( - num_linkages, num_workers, chunk_size, min_chunk_size=1 - ) - logger.info( - f"Distributing linkages in chunks of {chunk_size_} to {num_workers} ray workers." 
- ) - - # Put the observations into ray's local object storage ("plasma") - observation_oids = [] - for observations_i in yieldChunks(observations_split, chunk_size_): - observation_oids.append(ray.put(observations_i)) - - iod_orbits_oids = [] - iod_orbit_members_oids = [] - for observations_oid in observation_oids: - - iod_orbits_oid, iod_orbit_members_oid = iod_worker_ray.remote( - observations_oid, - observation_selection_method=observation_selection_method, - rchi2_threshold=rchi2_threshold, + futures = [] + for linkage_id_chunk in _iterate_chunks(linkage_ids, chunk_size): + futures.append( + iod_worker_remote.remote( + linkage_id_chunk, + observations_ref, + linkage_members_ref, min_obs=min_obs, min_arc_length=min_arc_length, contamination_percentage=contamination_percentage, - iterate=iterate, - light_time=light_time, - linkage_id_col=linkage_id_col, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - iod_orbits_oids.append(iod_orbits_oid) - iod_orbit_members_oids.append(iod_orbit_members_oid) - - iod_orbits_dfs = ray.get(iod_orbits_oids) - iod_orbit_members_dfs = ray.get(iod_orbit_members_oids) - - elif parallel_backend == "mp": - - chunk_size_ = calcChunkSize( - num_linkages, num_workers, chunk_size, min_chunk_size=1 - ) - logger.info( - f"Distributing linkages in chunks of {chunk_size_} to {num_workers} workers." - ) - - p = mp.Pool( - processes=num_workers, - initializer=_initWorker, - ) - - results = p.starmap( - partial( - iod_worker, - observation_selection_method=observation_selection_method, rchi2_threshold=rchi2_threshold, - min_obs=min_obs, - min_arc_length=min_arc_length, - contamination_percentage=contamination_percentage, + observation_selection_method=observation_selection_method, iterate=iterate, light_time=light_time, linkage_id_col=linkage_id_col, propagator=propagator, propagator_kwargs=propagator_kwargs, - ), - zip(yieldChunks(observations_split, chunk_size_)), + ) ) - p.close() - - results = list(zip(*results)) - iod_orbits_dfs = results[0] - iod_orbit_members_dfs = results[1] - - elif parallel_backend == "cf": - with cf.ProcessPoolExecutor( - max_workers=num_workers, initializer=_initWorker - ) as executor: - futures = [] - for observations_i in yieldChunks(observations_split, chunk_size): - futures.append( - executor.submit( - iod_worker, - observations_i, - observation_selection_method=observation_selection_method, - rchi2_threshold=rchi2_threshold, - min_obs=min_obs, - min_arc_length=min_arc_length, - contamination_percentage=contamination_percentage, - iterate=iterate, - light_time=light_time, - linkage_id_col=linkage_id_col, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - ) - iod_orbits_dfs = [] - iod_orbit_members_dfs = [] - for f in cf.as_completed(futures): - iod_orbits_df, iod_orbit_members_df = f.result() - iod_orbits_dfs.append(iod_orbits_df) - iod_orbit_members_dfs.append(iod_orbit_members_df) - else: - raise ValueError( - f"Unknown parallel backend: {parallel_backend}. Must be one of: 'ray', 'mp', 'cf'." 
- ) + while futures: + finished, futures = ray.wait(futures, num_returns=1) + result = ray.get(finished[0]) + iod_orbits_list.append(result[0]) + iod_orbit_members_list.append(result[1]) else: - - for observations_i in yieldChunks(observations_split, chunk_size): - iod_orbits_df, iod_orbit_members_df = iod_worker( - observations_i, - observation_selection_method=observation_selection_method, - rchi2_threshold=rchi2_threshold, + for linkage_id_chunk in _iterate_chunks(linkage_ids, chunk_size): + iod_orbits_chunk, iod_orbit_members_chunk = iod_worker( + linkage_id_chunk, + observations, + linkage_members, min_obs=min_obs, min_arc_length=min_arc_length, contamination_percentage=contamination_percentage, + rchi2_threshold=rchi2_threshold, + observation_selection_method=observation_selection_method, iterate=iterate, light_time=light_time, linkage_id_col=linkage_id_col, propagator=propagator, propagator_kwargs=propagator_kwargs, ) - iod_orbits_dfs.append(iod_orbits_df) - iod_orbit_members_dfs.append(iod_orbit_members_df) - - iod_orbits = pd.concat(iod_orbits_dfs, ignore_index=True) - iod_orbit_members = pd.concat(iod_orbit_members_dfs, ignore_index=True) - - for col in ["num_obs"]: - iod_orbits[col] = iod_orbits[col].astype(int) - for col in ["gauss_sol", "outlier"]: - iod_orbit_members[col] = iod_orbit_members[col].astype(int) + iod_orbits_list.append(iod_orbits_chunk) + iod_orbit_members_list.append(iod_orbit_members_chunk) + + iod_orbits = qv.concatenate(iod_orbits_list) + iod_orbit_members = qv.concatenate(iod_orbit_members_list) + + iod_orbits, iod_orbit_members = iod_orbits.drop_duplicates( + iod_orbit_members, + subset=[ + "coordinates.time.days", + "coordinates.time.nanos", + "coordinates.x", + "coordinates.y", + "coordinates.z", + "coordinates.vx", + "coordinates.vy", + "coordinates.vz", + ], + keep="first", + ) logger.info("Found {} initial orbits.".format(len(iod_orbits))) - if identify_subsets and len(iod_orbits) > 0: - iod_orbits, iod_orbit_members = identifySubsetLinkages( - iod_orbits, iod_orbit_members, linkage_id_col="orbit_id" - ) - logger.info( - "{} subset orbits identified.".format( - len(iod_orbits[~iod_orbits["subset_of"].isna()]) - ) - ) - - iod_orbits, iod_orbit_members = sortLinkages( - iod_orbits, iod_orbit_members, observations, linkage_id_col="orbit_id" - ) - else: - iod_orbits = pd.DataFrame( - columns=[ - "orbit_id", - "mjd_tdb", - "x", - "y", - "z", - "vx", - "vy", - "vz", - "arc_length", - "num_obs", - "chi2", - "rchi2", - ] - ) - - iod_orbit_members = pd.DataFrame( - columns=[ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - "gauss_sol", - "outlier", - ] - ) + iod_orbits = FittedOrbits.empty() + iod_orbit_members = FittedOrbitMembers.empty() time_end = time.time() logger.info( diff --git a/thor/orbits/iterators.py b/thor/orbits/iterators.py index d4e8b074..ece94ee3 100644 --- a/thor/orbits/iterators.py +++ b/thor/orbits/iterators.py @@ -1,7 +1,10 @@ import numpy as np -from adam_core import dynamics +from adam_core.constants import Constants as c +from adam_core.dynamics.lagrange import ( + apply_lagrange_coefficients, + calc_lagrange_coefficients, +) -from ..constants import Constants as c from .state_transition import calcStateTransitionMatrix __all__ = ["iterateStateTransition"] @@ -100,10 +103,10 @@ def iterateStateTransition( # differential equation: # d\chi / dt = \sqrt{mu} / r # and the corresponding state vector - lagrange_coeffs, stumpff_coeffs, chi = dynamics.calc_lagrange_coefficients( + lagrange_coeffs, 
stumpff_coeffs, chi = calc_lagrange_coefficients( r, v, dt, mu=mu, max_iter=max_iter, tol=tol ) - r_new, v_new = dynamics.apply_lagrange_coefficients(r, v, *lagrange_coeffs) + r_new, v_new = apply_lagrange_coefficients(r, v, *lagrange_coeffs) # Calculate the state transition matrix STM = calcStateTransitionMatrix( diff --git a/thor/orbits/od.py b/thor/orbits/od.py index eca015a2..e7fbc153 100644 --- a/thor/orbits/od.py +++ b/thor/orbits/od.py @@ -1,81 +1,62 @@ -import os -from typing import Literal - -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" -os.environ["VECLIB_MAXIMUM_THREADS"] = "1" -os.environ["NUMEXPR_NUM_THREADS"] = "1" - -import concurrent.futures as cf -import copy import logging -import multiprocessing as mp import time -from functools import partial +from typing import Literal, Optional, Tuple import numpy as np -import pandas as pd +import numpy.typing as npt +import pyarrow.compute as pc import quivr as qv +import ray from adam_core.coordinates import CartesianCoordinates, CoordinateCovariances -from adam_core.observers import Observers +from adam_core.coordinates.residuals import Residuals from adam_core.orbits import Orbits -from adam_core.propagator import PYOORB -from adam_core.time import Timestamp +from adam_core.propagator import PYOORB, _iterate_chunks from scipy.linalg import solve -from ..utils import ( - _checkParallel, - _initWorker, - calcChunkSize, - sortLinkages, - yieldChunks, -) -from .residuals import calcResiduals -from .utils import _ephemeris_to_dataframe +from ..observations.observations import Observations +from ..orbit_determination import FittedOrbitMembers, FittedOrbits logger = logging.getLogger(__name__) -__all__ = ["od_worker", "od", "differentialCorrection"] +__all__ = ["differential_correction"] def od_worker( - orbits_list, - observations_list, - rchi2_threshold=100, - min_obs=5, - min_arc_length=1.0, - contamination_percentage=20, - delta=1e-6, - max_iter=20, - method="central", - fit_epoch=False, + orbit_ids: npt.NDArray[np.str_], + orbits: FittedOrbits, + orbit_members: FittedOrbitMembers, + observations: Observations, + rchi2_threshold: float = 100, + min_obs: int = 5, + min_arc_length: float = 1.0, + contamination_percentage: float = 0.0, + delta: float = 1e-6, + max_iter: int = 20, + method: Literal["central", "finite"] = "central", + fit_epoch: bool = False, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, -): - od_orbits_dfs = [] - od_orbit_members_dfs = [] - for orbit, observations in zip(orbits_list, observations_list): - try: - assert orbit.orbit_id[0].as_py() == observations["orbit_id"].unique()[0] - assert np.all( - sorted(observations["mjd_utc"].values) == observations["mjd_utc"].values - ) - assert len(np.unique(observations["mjd_utc"].values)) == len( - observations["mjd_utc"].values - ) - except: - err = ( - "Invalid observations and orbit have been passed to the OD code.\n" - "Orbit ID: {}".format(orbit.orbit_id[0].as_py()) - ) - raise ValueError(err) +) -> Tuple[FittedOrbits, FittedOrbitMembers]: + + od_orbits_list = [] + od_orbit_members_list = [] + + for orbit_id in orbit_ids: time_start = time.time() - logger.debug(f"Differentially correcting orbit {orbit.orbit_id[0].as_py()}...") + logger.debug(f"Differentially correcting orbit {orbit_id}...") + + orbit = orbits.select("orbit_id", orbit_id) + obs_ids = orbit_members.apply_mask( + pc.equal(orbit_members.orbit_id, orbit_id) + ).obs_id + orbit_observations = observations.apply_mask( 
+ pc.is_in(observations.detections.id, obs_ids) + ) + od_orbit, od_orbit_members = od( orbit, - observations, + orbit_observations, rchi2_threshold=rchi2_threshold, min_obs=min_obs, min_arc_length=min_arc_length, @@ -89,32 +70,35 @@ def od_worker( ) time_end = time.time() duration = time_end - time_start - logger.debug( - f"OD for orbit {orbit.orbit_id[0].as_py()} completed in {duration:.3f}s." - ) + logger.debug(f"OD for orbit {orbit_id} completed in {duration:.3f}s.") - od_orbits_dfs.append(od_orbit) - od_orbit_members_dfs.append(od_orbit_members) + od_orbits_list.append(od_orbit) + od_orbit_members_list.append(od_orbit_members) - od_orbits = pd.concat(od_orbits_dfs, ignore_index=True) - od_orbit_members = pd.concat(od_orbit_members_dfs, ignore_index=True) + od_orbits = qv.concatenate(od_orbits_list) + od_orbit_members = qv.concatenate(od_orbit_members_list) return od_orbits, od_orbit_members +od_worker_remote = ray.remote(od_worker) +od_worker_remote.options(num_returns=1, num_cpus=1) + + def od( - orbit, - observations, - rchi2_threshold=100, - min_obs=5, - min_arc_length=1.0, - contamination_percentage=0.0, - delta=1e-6, - max_iter=20, - method="central", - fit_epoch=False, + orbit: FittedOrbits, + observations: Observations, + rchi2_threshold: float = 100, + min_obs: int = 5, + min_arc_length: float = 1.0, + contamination_percentage: float = 0.0, + delta: float = 1e-6, + max_iter: int = 20, + method: Literal["central", "finite"] = "central", + fit_epoch: bool = False, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, -): +) -> Tuple[FittedOrbits, FittedOrbitMembers]: + if propagator == "PYOORB": prop = PYOORB(**propagator_kwargs) else: @@ -124,30 +108,12 @@ def od( err = "method should be one of 'central' or 'finite'." raise ValueError(err) - observables = ["RA_deg", "Dec_deg"] - - obs_ids_all = observations["obs_id"].values - coords = observations[observables].values - coords_sigma = observations[["RA_sigma_deg", "Dec_sigma_deg"]].values - - # Create Observers table - observers_list = [] - for observatory_code in observations["observatory_code"].unique(): - observers_list.append( - Observers.from_code( - observatory_code, - Timestamp.from_mjd( - observations[ - observations["observatory_code"].isin([observatory_code]) - ]["mjd_utc"].unique(), - scale="utc", - ), - ) - ) - observers = qv.concatenate(observers_list) - observers = observers.sort_by( - ["coordinates.time.days", "coordinates.time.nanos", "code"] - ) + obs_ids_all = observations.detections.id.to_numpy(zero_copy_only=False) + coords = observations.to_spherical_coordinates() + coords_sigma = coords.covariance.sigmas[:, 1:3] + observers_with_states = observations.get_observers() + observers = observers_with_states.observers + times_all = coords.time.mjd().to_numpy(zero_copy_only=False) # FLAG: can we stop iterating to find a solution? 
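The iteration that follows is a batch least-squares corrector: each state component is perturbed in turn, a design matrix is built from central (or forward) differences of the predicted observations, the rows are weighted by the inverse measurement variances, and the normal equations are solved for a state correction. A standalone sketch of one such update with a generic model function (the names here are illustrative, not part of this module):

    import numpy as np
    from scipy.linalg import solve

    def correction_step(state, model, obs, sigmas, delta=1e-6):
        """One Gauss-Newton update: solve (A^T W A) dx = A^T W b for dx."""
        residuals = obs - model(state)                # b, shape (N, 2)
        num_obs, num_params = len(obs), len(state)
        A = np.zeros((num_obs, 2, num_params))        # d(prediction) / d(state)
        for i in range(num_params):
            d = np.zeros(num_params)
            d[i] = delta
            # Central difference in parameter i
            A[:, :, i] = (model(state + d) - model(state - d)) / (2 * delta)
        ATWA = np.zeros((num_params, num_params))
        ATWb = np.zeros(num_params)
        for n in range(num_obs):
            W = np.diag(1.0 / sigmas[n] ** 2)         # weight by 1 / sigma^2
            ATWA += A[n].T @ W @ A[n]
            ATWb += A[n].T @ W @ residuals[n]
        return state + solve(ATWA, ATWb)              # repeat until rchi2 stops improving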
converged = False @@ -175,20 +141,23 @@ def od( # Calculate chi2 for residuals on the given observations # for the current orbit, the goal is for the orbit to improve # such that the chi2 improves - orbit_prev_ = copy.deepcopy(orbit) + orbit_prev_ = orbit.to_orbits() - ephemeris_prev_ = prop._generate_ephemeris(orbit_prev_, observers) - ephemeris_prev_ = _ephemeris_to_dataframe(ephemeris_prev_) - # TODO: replace with newer types + ephemeris_prev_ = prop.generate_ephemeris( + orbit_prev_, observers, chunk_size=1, max_processes=1 + ) - residuals_prev_, stats_prev_ = calcResiduals( + # Calculate residuals and chi2 + residuals_prev_ = Residuals.calculate( coords, - ephemeris_prev_[observables].values, - sigmas_actual=coords_sigma, - include_probabilistic=False, + ephemeris_prev_.coordinates, ) + residuals_prev_array = np.stack( + residuals_prev_.values.to_numpy(zero_copy_only=False) + )[:, 1:3] + num_obs_ = len(observations) - chi2_prev_ = stats_prev_[0] + chi2_prev_ = residuals_prev_.chi2.to_numpy() chi2_total_prev_ = np.sum(chi2_prev_) rchi2_prev_ = np.sum(chi2_prev_) / (2 * num_obs - 6) @@ -203,7 +172,7 @@ def od( rchi2_prev = rchi2_prev_ ids_mask = np.array([True for i in range(num_obs)]) - times_all = ephemeris_prev["mjd_utc"].values + times_all = ephemeris_prev.coordinates.time.mjd().to_numpy() obs_id_outlier = [] delta_prev = delta iterations = 0 @@ -250,16 +219,14 @@ def od( else: num_params = 6 - A = np.zeros((coords.shape[1], num_params, num_obs)) + A = np.zeros((2, num_params, num_obs)) ATWA = np.zeros((num_params, num_params, num_obs)) ATWb = np.zeros((num_params, 1, num_obs)) # Generate ephemeris with current nominal orbit - ephemeris_nom = prop._generate_ephemeris(orbit_prev, observers) - ephemeris_nom = _ephemeris_to_dataframe(ephemeris_nom) - # TODO: replace with newer types - - coords_nom = ephemeris_nom[observables].values + ephemeris_nom = prop.generate_ephemeris( + orbit_prev, observers, chunk_size=1, max_processes=1 + ) # Modify each component of the state by a small delta d = np.zeros((1, 7)) @@ -301,10 +268,9 @@ def od( ) # Calculate the modified ephemerides - ephemeris_mod_p = prop._generate_ephemeris(orbit_iter_p, observers) - ephemeris_mod_p = _ephemeris_to_dataframe(ephemeris_mod_p) - # TODO: replace with newer types - coords_mod_p = ephemeris_mod_p[observables].values + ephemeris_mod_p = prop.generate_ephemeris( + orbit_iter_p, observers, chunk_size=1, max_processes=1 + ) delta_denom = d[0, i] if method == "central": @@ -326,27 +292,28 @@ def od( ) # Calculate the modified ephemerides - ephemeris_mod_n = prop._generate_ephemeris(orbit_iter_n, observers) - ephemeris_mod_n = _ephemeris_to_dataframe(ephemeris_mod_n) - # TODO: replace with newer types - coords_mod_n = ephemeris_mod_n[observables].values + ephemeris_mod_n = prop.generate_ephemeris( + orbit_iter_n, observers, chunk_size=1, max_processes=1 + ) delta_denom *= 2 else: - coords_mod_n = coords_nom + ephemeris_mod_n = ephemeris_nom - residuals_mod, _ = calcResiduals( - coords_mod_p, - coords_mod_n, - sigmas_actual=None, - include_probabilistic=False, + residuals_mod = Residuals.calculate( + ephemeris_mod_p.coordinates, + ephemeris_mod_n.coordinates, ) + residuals_mod = np.stack( + residuals_mod.values.to_numpy(zero_copy_only=False) + ) + residuals_mod_array = residuals_mod[:, 1:3] for n in range(num_obs): try: A[:, i : i + 1, n] = ( - residuals_mod[ids_mask][n : n + 1].T / delta_denom + residuals_mod_array[ids_mask][n : n + 1].T / delta_denom ) except RuntimeError: print(orbit_prev.orbit_id) @@ -354,7 +321,7 @@ 
def od( for n in range(num_obs): W = np.diag(1 / coords_sigma[n] ** 2) ATWA[:, :, n] = A[:, :, n].T @ W @ A[:, :, n] - ATWb[:, :, n] = A[:, :, n].T @ W @ residuals_prev[n : n + 1].T + ATWb[:, :, n] = A[:, :, n].T @ W @ residuals_prev_array[n : n + 1].T ATWA = np.sum(ATWA, axis=2) ATWb = np.sum(ATWb, axis=2) @@ -444,14 +411,12 @@ def od( continue # Generate ephemeris with current nominal orbit - ephemeris_iter = prop._generate_ephemeris(orbit_iter, observers) - ephemeris_iter = _ephemeris_to_dataframe(ephemeris_iter) - coords_iter = ephemeris_iter[observables].values - - residuals, stats = calcResiduals( - coords, coords_iter, sigmas_actual=coords_sigma, include_probabilistic=False + ephemeris_iter = prop.generate_ephemeris( + orbit_iter, observers, chunk_size=1, max_processes=1 ) - chi2_iter = stats[0] + + residuals = Residuals.calculate(coords, ephemeris_iter.coordinates) + chi2_iter = residuals.chi2.to_numpy() chi2_total_iter = np.sum(chi2_iter[ids_mask]) rchi2_iter = chi2_total_iter / (2 * num_obs - num_params) arc_length = times_all[ids_mask].max() - times_all[ids_mask].min() @@ -541,94 +506,58 @@ def od( if not solution_found or not processable or first_solution: - od_orbit = pd.DataFrame( - columns=[ - "orbit_id", - "mjd_tdb", - "x", - "y", - "z", - "vx", - "vy", - "vz", - "covariance", - "r", - "r_sigma", - "v", - "v_sigma", - "arc_length", - "num_obs", - "num_params", - "num_iterations", - "chi2", - "rchi2", - "improved", - ] - ) + od_orbit = FittedOrbits.empty() + od_orbit_members = FittedOrbitMembers.empty() - od_orbit_members = pd.DataFrame( - columns=[ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - "outlier", - ] + else: + + obs_times = observations.detections.time.mjd().to_numpy()[ids_mask] + arc_length_ = obs_times.max() - obs_times.min() + assert arc_length == arc_length_ + + od_orbit = FittedOrbits.from_kwargs( + orbit_id=orbit_prev.orbit_id, + object_id=orbit_prev.object_id, + coordinates=orbit_prev.coordinates, + arc_length=[arc_length_], + num_obs=[num_obs], + chi2=[chi2_total_prev], + reduced_chi2=[rchi2_prev], + improved=[improved], ) - else: - obs_times = observations["mjd_utc"].values[ids_mask] - od_orbit = orbit_prev.to_dataframe() - od_orbit["r"] = orbit_prev.coordinates.r_mag - od_orbit["r_sigma"] = orbit_prev.coordinates.sigma_r_mag - od_orbit["v"] = orbit_prev.coordinates.v_mag - od_orbit["v_sigma"] = orbit_prev.coordinates.sigma_v_mag - od_orbit["arc_length"] = np.max(obs_times) - np.min(obs_times) - od_orbit["num_obs"] = num_obs - od_orbit["num_params"] = num_params - od_orbit["num_iterations"] = iterations - od_orbit["chi2"] = chi2_total_prev - od_orbit["rchi2"] = rchi2_prev - od_orbit["improved"] = improved - - od_orbit_members = pd.DataFrame( - { - "orbit_id": [ - orbit_prev.orbit_id[0].as_py() for i in range(len(obs_ids_all)) - ], - "obs_id": obs_ids_all, - "residual_ra_arcsec": residuals_prev[:, 0] * 3600, - "residual_dec_arcsec": residuals_prev[:, 1] * 3600, - "chi2": chi2_prev, - "outlier": np.zeros(len(obs_ids_all), dtype=int), - } + # od_orbit["num_params"] = num_params + # od_orbit["num_iterations"] = iterations + # od_orbit["improved"] = improved + + od_orbit_members = FittedOrbitMembers.from_kwargs( + orbit_id=np.full(len(obs_ids_all), orbit_prev.orbit_id[0].as_py()), + obs_id=obs_ids_all, + residuals=residuals_prev, + solution=np.isin(obs_ids_all, obs_id_outlier, invert=True), + outlier=np.isin(obs_ids_all, obs_id_outlier), ) - od_orbit_members.loc[ - od_orbit_members["obs_id"].isin(obs_id_outlier), 
"outlier" - ] = 1 return od_orbit, od_orbit_members -def differentialCorrection( - orbits, - orbit_members, - observations, - min_obs=5, - min_arc_length=1.0, - contamination_percentage=20, - rchi2_threshold=100, - delta=1e-8, - max_iter=20, - method="central", - fit_epoch=False, +def differential_correction( + orbits: FittedOrbits, + orbit_members: FittedOrbitMembers, + observations: Observations, + min_obs: int = 5, + min_arc_length: float = 1.0, + contamination_percentage: float = 20, + rchi2_threshold: float = 100, + delta: float = 1e-8, + max_iter: int = 20, + method: Literal["central", "finite"] = "central", + fit_epoch: bool = False, propagator: Literal["PYOORB"] = "PYOORB", propagator_kwargs: dict = {}, - chunk_size=10, - num_jobs=60, - parallel_backend="cf", -): + chunk_size: int = 10, + max_processes: Optional[int] = 1, +) -> Tuple[FittedOrbits, FittedOrbitMembers]: """ Differentially correct (via finite/central differencing). @@ -648,103 +577,27 @@ def differentialCorrection( if len(orbits) > 0 and len(orbit_members) > 0: - orbits_, orbit_members_ = sortLinkages(orbits, orbit_members, observations) + orbit_ids = orbits.orbit_id.to_numpy(zero_copy_only=False) - start = time.time() - logger.debug("Merging observations on linkage members...") - linked_observations = orbit_members_[ - orbit_members_[["orbit_id", "obs_id"]]["orbit_id"].isin( - orbits_["orbit_id"].values - ) - ].merge(observations, on="obs_id", how="left") - duration = time.time() - start - logger.debug(f"Merging completed in {duration:.3f}s.") - - start = time.time() - logger.debug("Grouping observations by orbit ID...") - grouped_observations = linked_observations.groupby(by=["orbit_id"]) - logger.debug("Splitting grouped observations by orbit ID...") - observations_split = [ - grouped_observations.get_group(g).reset_index(drop=True) - for g in grouped_observations.groups - ] - duration = time.time() - start - logger.debug(f"Grouping and splitting completed in {duration:.3f}s.") - - orbits_initial = Orbits.from_flat_dataframe(orbits_) - orbits_split = [orbit for orbit in orbits_initial] - num_orbits = len(orbits) - - parallel, num_workers = _checkParallel(num_jobs, parallel_backend) - if num_workers > 1: - - if parallel_backend == "ray": - import ray - - if not ray.is_initialized(): - ray.init(address="auto") - - od_worker_ray = ray.remote(od_worker) - od_worker_ray = od_worker_ray.options(num_returns=2, num_cpus=1) - - # Send up to chunk_size orbits to each OD worker for processing - chunk_size_ = calcChunkSize( - num_orbits, num_workers, chunk_size, min_chunk_size=1 - ) - logger.info( - f"Distributing linkages in chunks of {chunk_size_} to {num_workers} ray workers." 
- ) + od_orbits_list = [] + od_orbit_members_list = [] + if max_processes is None or max_processes > 1: - # Put the observations and orbits into ray's local object storage ("plasma") - orbit_oids = [] - observation_oids = [] - for orbits_i, observations_i in zip( - yieldChunks(orbits_split, chunk_size_), - yieldChunks(observations_split, chunk_size_), - ): - orbit_oids.append(ray.put(orbits_i)) - observation_oids.append(ray.put(observations_i)) - - od_orbits_oids = [] - od_orbit_members_oids = [] - for orbits_oid, observations_oid in zip(orbit_oids, observation_oids): - - od_orbits_oid, od_orbit_members_oid = od_worker_ray.remote( - orbits_oid, - observations_oid, - rchi2_threshold=rchi2_threshold, - min_obs=min_obs, - min_arc_length=min_arc_length, - contamination_percentage=contamination_percentage, - delta=delta, - max_iter=max_iter, - method=method, - fit_epoch=fit_epoch, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - od_orbits_oids.append(od_orbits_oid) - od_orbit_members_oids.append(od_orbit_members_oid) + if not ray.is_initialized(): + ray.init(address="auto") - od_orbits_dfs = ray.get(od_orbits_oids) - od_orbit_members_dfs = ray.get(od_orbit_members_oids) + orbits_ref = ray.put(orbits) + orbit_members_ref = ray.put(orbit_members) + observations_ref = ray.put(observations) - elif parallel_backend == "mp": - - chunk_size_ = calcChunkSize( - num_orbits, num_workers, chunk_size, min_chunk_size=1 - ) - logger.info( - f"Distributing linkages in chunks of {chunk_size_} to {num_workers} workers." - ) - - p = mp.Pool( - processes=num_workers, - initializer=_initWorker, - ) - results = p.starmap( - partial( - od_worker, + futures = [] + for orbit_ids_chunk in _iterate_chunks(orbit_ids, chunk_size): + futures.append( + od_worker_remote.remote( + orbit_ids_chunk, + orbits_ref, + orbit_members_ref, + observations_ref, rchi2_threshold=rchi2_threshold, min_obs=min_obs, min_arc_length=min_arc_length, @@ -755,68 +608,23 @@ def differentialCorrection( fit_epoch=fit_epoch, propagator=propagator, propagator_kwargs=propagator_kwargs, - ), - zip( - yieldChunks(orbits_split, chunk_size_), - yieldChunks(observations_split, chunk_size_), - ), + ) ) - p.close() - - results = list(zip(*results)) - od_orbits_dfs = results[0] - od_orbit_members_dfs = results[1] - - elif parallel_backend == "cf": - with cf.ProcessPoolExecutor( - max_workers=num_workers, initializer=_initWorker - ) as executor: - futures = [] - for orbits_i, observations_i in zip( - yieldChunks(orbits_split, chunk_size), - yieldChunks(observations_split, chunk_size), - ): - futures.append( - executor.submit( - od_worker, - orbits_i, - observations_i, - rchi2_threshold=rchi2_threshold, - min_obs=min_obs, - min_arc_length=min_arc_length, - contamination_percentage=contamination_percentage, - delta=delta, - max_iter=max_iter, - method=method, - fit_epoch=fit_epoch, - propagator=propagator, - propagator_kwargs=propagator_kwargs, - ) - ) - od_orbits_dfs = [] - od_orbit_members_dfs = [] - for future in cf.as_completed(futures): - od_orbits_df, od_orbit_members_df = future.result() - od_orbits_dfs.append(od_orbits_df) - od_orbit_members_dfs.append(od_orbit_members_df) - else: - raise ValueError( - f"Unknown parallel backend: {parallel_backend}. Must be one of: 'ray', 'mp', 'cf'." 
- ) + while futures: + finished, futures = ray.wait(futures, num_returns=1) + results = ray.get(finished[0]) + od_orbits_list.append(results[0]) + od_orbit_members_list.append(results[1]) else: - od_orbits_dfs = [] - od_orbit_members_dfs = [] - for orbits_i, observations_i in zip( - yieldChunks(orbits_split, chunk_size), - yieldChunks(observations_split, chunk_size), - ): - - od_orbits_df, od_orbit_members_df = od_worker( - orbits_i, - observations_i, + for orbit_ids_chunk in _iterate_chunks(orbit_ids, chunk_size): + od_orbits_chunk, od_orbit_members_chunk = od_worker( + orbit_ids_chunk, + orbits, + orbit_members, + observations, rchi2_threshold=rchi2_threshold, min_obs=min_obs, min_arc_length=min_arc_length, @@ -828,54 +636,15 @@ def differentialCorrection( propagator=propagator, propagator_kwargs=propagator_kwargs, ) - od_orbits_dfs.append(od_orbits_df) - od_orbit_members_dfs.append(od_orbit_members_df) - - od_orbits = pd.concat(od_orbits_dfs, ignore_index=True) - od_orbit_members = pd.concat(od_orbit_members_dfs, ignore_index=True) + od_orbits_list.append(od_orbits_chunk) + od_orbit_members_list.append(od_orbit_members_chunk) - for col in ["num_obs"]: - od_orbits[col] = od_orbits[col].astype(int) - for col in ["outlier"]: - od_orbit_members[col] = od_orbit_members[col].astype(int) - - od_orbits, od_orbit_members = sortLinkages( - od_orbits, od_orbit_members, observations, linkage_id_col="orbit_id" - ) + od_orbits = qv.concatenate(od_orbits_list) + od_orbit_members = qv.concatenate(od_orbit_members_list) else: - od_orbits = pd.DataFrame( - columns=[ - "orbit_id", - "mjd_tdb", - "x", - "y", - "z", - "vx", - "vy", - "vz", - "covariance", - "r", - "r_sigma", - "v", - "v_sigma", - "arc_length", - "num_obs", - "chi2", - "rchi2", - ] - ) - - od_orbit_members = pd.DataFrame( - columns=[ - "orbit_id", - "obs_id", - "residual_ra_arcsec", - "residual_dec_arcsec", - "chi2", - "outlier", - ] - ) + od_orbits = FittedOrbits.empty() + od_orbit_members = FittedOrbitMembers.empty() time_end = time.time() logger.info("Differentially corrected {} orbits.".format(len(od_orbits))) diff --git a/thor/orbits/residuals.py b/thor/orbits/residuals.py deleted file mode 100644 index 29d227df..00000000 --- a/thor/orbits/residuals.py +++ /dev/null @@ -1,191 +0,0 @@ -import numpy as np -from scipy.spatial.distance import mahalanobis -from scipy.stats import chi2 - -__all__ = ["calcResiduals", "calcSimpleResiduals", "calcProbabilisticResiduals"] - - -def calcResiduals( - coords_actual, - coords_desired, - sigmas_actual=None, - covariances_actual=None, - include_probabilistic=True, -): - """ - Calculate residuals (actual - desired) and the associated chi2 if the - 1-sigma uncertainties on the actual quantities are provided. If desired, - a probabilistic measure of residual can also be calculated using the 1-sigma - uncertainties (which are converted into a covariance matrix) or by providing the - coveriance matrices for each coordinate. The probabilistic statistics include - the chi2 for each measurement, the mahalanobis distance and the associated - probability (calculated as 1 - chi2.cdf(mahalanobis_distance, degrees of freedom)). - - Parameters - ---------- - coords_actual : `~numpy.ndarray` (N, M) - Actual N coordinates in M dimensions. - coords_desired : `~numpy.ndarray` (N, M) - The desired N coordinates in M dimensions. - sigmas_actual : `~numpy.ndarray` (N, M), optional - The 1-sigma uncertainties of the actual coordinates. Can be - None, in which case chi2 will be return as a NaN. 
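The parallel path above (and the equivalent one in initial_orbit_determination) follows a single pattern: put the large tables into ray's object store once, submit one task per chunk of IDs, then drain the futures with ray.wait as they complete. A self-contained sketch of that pattern with a toy worker and toy data (nothing here is THOR's own API):

    import numpy as np
    import ray

    @ray.remote
    def process_chunk(ids_chunk, data):
        # Toy stand-in for iod_worker / od_worker: one result per ID in the chunk
        return {i: float(np.sum(data[i])) for i in ids_chunk}

    if not ray.is_initialized():
        ray.init(num_cpus=2)

    data = {i: np.arange(10) * i for i in range(100)}
    data_ref = ray.put(data)            # shared once, not serialized per task
    ids = list(range(100))
    chunk_size = 25

    futures = [
        process_chunk.remote(ids[i : i + chunk_size], data_ref)
        for i in range(0, len(ids), chunk_size)
    ]

    results = {}
    while futures:                      # gather results in completion order
        finished, futures = ray.wait(futures, num_returns=1)
        results.update(ray.get(finished[0]))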
- covariances_actual : list of N `~numpy.ndarray`s (M, M) - The covariance matrix in M dimensions for each - actual observation if available. - include_probabilistic : bool, optional - Include the calculation of mahalanobis distance and the associated - probability. - - Returns - ------- - residuals : `~numpy.ndarray` (N, 2) - Residuals for each coordinate (actual - desired) - stats : tuple (1 or 3) - chi2 : `~numpy.ndarray` (N) - Chi-squared for each observation given the 1-sigma - uncertainties. NaN if no sigmas_actual is None. - p : `~numpy.ndarray` (N) - The probability that the actual coordinates given their uncertainty - belong to the same multivariate normal distribution as the desired - coordinates. - d : `~numpy.ndarray` (N) - The Mahalanobis distance of each coordinate compared to the desired - coordinates. - """ - if ( - covariances_actual is None - and sigmas_actual is not None - and include_probabilistic - ): - covariances_actual_ = [np.diag(i**2) for i in sigmas_actual] - sigmas_actual_ = sigmas_actual - elif covariances_actual is not None and sigmas_actual is None: - sigmas_actual_ = np.zeros_like(coords_actual) - for i in range(len(coords_actual)): - sigmas_actual_[i] = np.diagonal(covariances_actual[i]) - sigmas_actual_ = np.sqrt(sigmas_actual_) - else: - covariances_actual_ = covariances_actual - sigmas_actual_ = sigmas_actual - - residuals, chi2 = calcSimpleResiduals( - coords_actual, coords_desired, sigmas_actual=sigmas_actual_ - ) - - if include_probabilistic: - p, d = calcProbabilisticResiduals( - coords_actual, coords_desired, covariances_actual_ - ) - stats = (chi2, p, d) - else: - - stats = (chi2,) - - return residuals, stats - - -def calcSimpleResiduals( - coords_actual, - coords_desired, - sigmas_actual=None, -): - """ - Calculate residuals and the associated chi2 if the - 1-sigma uncertainties on the actual quantities are provided. - - Parameters - ---------- - coords_actual : `~numpy.ndarray` (N, M) - Actual N coordinates in M dimensions. - coords_desired : `~numpy.ndarray` (N, M) - The desired N coordinates in M dimensions. - sigmas_actual : `~numpy.ndarray` (N, M), optional - The 1-sigma uncertainties of the actual coordinates. Can be - None, in which case chi2 will be return as a NaN. - - Returns - ------- - residuals : `~numpy.ndarray` (N, 2) - Residuals for each coordinate (actual - desired) - chi2 : `~numpy.ndarray` (N) - Chi-squared for each observation given the 1-sigma - uncertainties. NaN if no sigmas_actual is None. - """ - residuals = np.zeros_like(coords_actual) - - ra = coords_actual[:, 0] - dec = coords_actual[:, 1] - ra_pred = coords_desired[:, 0] - dec_pred = coords_desired[:, 1] - - # Calculate residuals in RA, make sure to fix any potential wrap around errors - residual_ra = ra - ra_pred - residual_ra = np.where(residual_ra > 180.0, 360.0 - residual_ra, residual_ra) - residual_ra *= np.cos(np.radians(dec_pred)) - - # Calculate residuals in Dec - residual_dec = dec - dec_pred - - if isinstance(sigmas_actual, np.ndarray): - sigma_ra = sigmas_actual[:, 0] - sigma_dec = sigmas_actual[:, 1] - - # Calculate chi2 - chi2 = (residual_ra**2 / sigma_ra**2) + (residual_dec**2 / sigma_dec**2) - else: - chi2 = np.NaN - - residuals[:, 0] = residual_ra - residuals[:, 1] = residual_dec - - return residuals, chi2 - - -def calcProbabilisticResiduals(coords_actual, coords_desired, covariances_actual): - """ - Calculate the probabilistic residual. - - Parameters - ---------- - coords_actual : `~numpy.ndarray` (N, M) - Actual N coordinates in M dimensions. 
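The RA handling in the removed calcSimpleResiduals above is the easy part to get wrong: the raw difference near the 0/360 degree boundary is huge until the wrap is folded back, and the result is then scaled by cos(Dec). A small worked check that mirrors that logic:

    import numpy as np

    ra, dec = 359.9999, 45.0            # actual position, degrees
    ra_pred, dec_pred = 0.0001, 45.0    # predicted position, degrees

    residual_ra = ra - ra_pred                                   # 359.9998 deg
    residual_ra = np.where(residual_ra > 180.0, 360.0 - residual_ra, residual_ra)
    residual_ra = residual_ra * np.cos(np.radians(dec_pred))     # scale by cos(Dec)

    print(residual_ra * 3600)           # ~0.51 arcsec instead of ~10^6 arcsec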
- coords_desired : `~numpy.ndarray` (N, M) - The desired N coordinates in M dimensions. - sigmas_actual : `~numpy.ndarray` (N, M) - The 1-sigma uncertainties of the actual coordinates. - covariances_actual : list of N `~numpy.ndarray`s (M, M) - The covariance matrix in M dimensions for each - actual observation if available. - - Returns - ------- - p : `~numpy.ndarray` (N) - The probability that the actual coordinates given their uncertainty - belong to the same multivariate normal distribution as the desired - coordinates. - d : `~numpy.ndarray` (N) - The Mahalanobis distance of each coordinate compared to the desired - coordinates. - """ - d = np.zeros(len(coords_actual)) - p = np.zeros(len(coords_actual)) - - for i, (actual, desired, covar) in enumerate( - zip(coords_actual, coords_desired, covariances_actual) - ): - # Calculate the degrees of freedom - k = len(actual) - - # Calculate the mahalanobis distance between the two coordinates - d_i = mahalanobis(actual, desired, np.linalg.inv(covar)) - - # Calculate the probability that both sets of coordinates are drawn from - # the same multivariate normal - p_i = 1 - chi2.cdf(d_i, k) - - # Append results - d[i] = d_i - p[i] = p_i - - return p, d diff --git a/thor/orbits/state_transition.py b/thor/orbits/state_transition.py index 494ffcc8..d4df5e20 100644 --- a/thor/orbits/state_transition.py +++ b/thor/orbits/state_transition.py @@ -1,7 +1,6 @@ import numpy as np from adam_core import dynamics - -from ..constants import Constants as c +from adam_core.constants import Constants as c __all__ = ["calcMMatrix", "calcStateTransitionMatrix"] diff --git a/thor/orbits/tests/test_attribution.py b/thor/orbits/tests/test_attribution.py new file mode 100644 index 00000000..1b1276cb --- /dev/null +++ b/thor/orbits/tests/test_attribution.py @@ -0,0 +1,36 @@ +import pytest +from adam_core.observations import PointSourceDetections +from adam_core.time import Timestamp + +from ...observations.observations import Observations +from ..attribution import Attributions + + +def test_Attributions_drop_coincident_attributions(): + observations = Observations.from_kwargs( + detections=PointSourceDetections.from_kwargs( + id=["01", "02", "03", "04"], + exposure_id=["e01", "e01", "e02", "e02"], + time=Timestamp.from_mjd([59001.1, 59001.1, 59002.1, 59002.1], scale="utc"), + ra=[1, 2, 3, 4], + dec=[5, 6, 7, 8], + mag=[10, 11, 12, 13], + ), + state_id=[0, 0, 1, 1], + observatory_code=["500", "500", "500", "500"], + ) + + attributions = Attributions.from_kwargs( + orbit_id=["o01", "o01", "o02", "o03"], + obs_id=["01", "02", "03", "03"], + distance=[0.5 / 3600, 1 / 3600, 2 / 3600, 1 / 3600], + ) + + filtered = attributions.drop_coincident_attributions(observations) + # Orbit 1 gets linked to two observations at the same time + # We should expect to only keep the one with the smallest distance + # Orbit 2 and 3 get linked to the same observation but we should keep both + assert len(filtered) == 3 + assert filtered.orbit_id.to_pylist() == ["o01", "o02", "o03"] + assert filtered.obs_id.to_pylist() == ["01", "03", "03"] + assert filtered.distance.to_pylist() == [0.5 / 3600, 2 / 3600, 1 / 3600] diff --git a/thor/orbits/utils.py b/thor/orbits/utils.py deleted file mode 100644 index 44b1b75b..00000000 --- a/thor/orbits/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -import pandas as pd -from adam_core.orbits import Ephemeris - - -def _ephemeris_to_dataframe(ephemeris: Ephemeris) -> pd.DataFrame: - ephemeris = ephemeris.sort_by( - [ - "orbit_id", - "coordinates.time.days", - 
"coordinates.time.nanos", - "coordinates.origin.code", - ] - ) - ephemeris_df = ephemeris.to_dataframe() - ephemeris_df.insert(2, "mjd_utc", ephemeris.coordinates.time.mjd()) - ephemeris_df.rename( - columns={ - "coordinates.lon": "RA_deg", - "coordinates.lat": "Dec_deg", - "coordinates.origin.code": "observatory_code", - }, - inplace=True, - ) - return ephemeris_df diff --git a/thor/range_and_transform.py b/thor/range_and_transform.py new file mode 100644 index 00000000..4feb63d4 --- /dev/null +++ b/thor/range_and_transform.py @@ -0,0 +1,213 @@ +import logging +import time +from typing import Optional, Type, Union + +import pyarrow.compute as pc +import quivr as qv +import ray +from adam_core.coordinates import ( + CartesianCoordinates, + OriginCodes, + transform_coordinates, +) +from adam_core.propagator import PYOORB, Propagator + +from .observations.observations import Observations +from .orbit import TestOrbit, TestOrbitEphemeris +from .projections import GnomonicCoordinates + +__all__ = [ + "TransformedDetections", + "range_and_transform", +] + + +logger = logging.getLogger(__name__) + + +class TransformedDetections(qv.Table): + id = qv.StringColumn() + coordinates = GnomonicCoordinates.as_column() + state_id = qv.Int64Column() + + +def range_and_transform_worker( + ranged_detections: CartesianCoordinates, + observations: Observations, + ephemeris: TestOrbitEphemeris, + state_id: int, +) -> TransformedDetections: + """ + Given ranged detections and their original observations, transform these to the gnomonic tangent + plane centered on the motion of the test orbit for a single state. + + Parameters + ---------- + ranged_detections + Cartesian detections ranged so that their heliocentric distance is the same as the test orbit + for each state + observations + The observations from which the ranged detections originate. These should be sorted one-to-one + with the ranged detections + ephemeris + Ephemeris from which to extract the test orbit's aberrated state. + state_id + The ID for this particular state. + + Returns + ------- + transformed_detections + Detections transformed to a gnomonic tangent plane centered on the motion of the + test orbit. + """ + # Select the detections and ephemeris for this state id + mask = pc.equal(state_id, observations.state_id) + ranged_detections_state = ranged_detections.apply_mask(mask) + ephemeris_state = ephemeris.select("id", state_id) + observations_state = observations.select("state_id", state_id) + + # Transform the detections into the co-rotating frame + return TransformedDetections.from_kwargs( + id=observations_state.detections.id, + coordinates=GnomonicCoordinates.from_cartesian( + ranged_detections_state, + center_cartesian=ephemeris_state.ephemeris.aberrated_coordinates, + ), + state_id=observations_state.state_id, + ) + + +range_and_transform_remote = ray.remote(range_and_transform_worker) + + +def range_and_transform( + test_orbit: TestOrbit, + observations: Union[Observations, ray.ObjectRef], + propagator: Type[Propagator] = PYOORB, + propagator_kwargs: dict = {}, + max_processes: Optional[int] = 1, +) -> TransformedDetections: + """ + Range observations for a single test orbit and transform them into a + gnomonic projection centered on the motion of the test orbit (co-rotating + frame). + + Parameters + ---------- + test_orbit : `~thor.orbit.TestOrbit` + Test orbit to use to gather and transform observations. + observations : `~thor.observations.observations.Observations` + Observations from which range and transform the detections. 
+ propagator : `~adam_core.propagator.propagator.Propagator` + Propagator to use to propagate the test orbit and generate + ephemerides. + max_processes : int, optional + Maximum number of processes to use for parallelization. If + an existing ray cluster is already running, this parameter + will be ignored if larger than 1 or not None. + + Returns + ------- + transformed_detections : `~thor.main.TransformedDetections` + The transformed detections as gnomonic coordinates + of the observations in the co-rotating frame. + """ + time_start = time.perf_counter() + logger.info("Running range and transform...") + logger.info(f"Assuming r = {test_orbit.orbit.coordinates.r[0]} au") + logger.info(f"Assuming v = {test_orbit.orbit.coordinates.v[0]} au/d") + + if isinstance(observations, ray.ObjectRef): + observations = ray.get(observations) + + prop = propagator(**propagator_kwargs) + + if len(observations) > 0: + # Compute the ephemeris of the test orbit (this will be cached) + ephemeris = test_orbit.generate_ephemeris_from_observations( + observations, + propagator=prop, + max_processes=max_processes, + ) + + # Assume that the heliocentric distance of all point sources in + # the observations are the same as that of the test orbit + ranged_detections_spherical = test_orbit.range_observations( + observations, + propagator=prop, + max_processes=max_processes, + ) + + # Transform from spherical topocentric to cartesian heliocentric coordinates + ranged_detections_cartesian = transform_coordinates( + ranged_detections_spherical.coordinates, + representation_out=CartesianCoordinates, + frame_out="ecliptic", + origin_out=OriginCodes.SUN, + ) + + transformed_detection_list = [] + if max_processes is None or max_processes > 1: + + if not ray.is_initialized(): + logger.debug( + f"Ray is not initialized. Initializing with {max_processes}..." + ) + ray.init(num_cpus=max_processes) + + if isinstance(observations, ray.ObjectRef): + observations_ref = observations + observations = ray.get(observations_ref) + else: + observations_ref = ray.put(observations) + + if isinstance(ephemeris, ray.ObjectRef): + ephemeris_ref = ephemeris + else: + ephemeris_ref = ray.put(ephemeris) + + ranged_detections_cartesian_ref = ray.put(ranged_detections_cartesian) + + # Get state IDs + state_ids = observations.state_id.unique().sort() + futures = [] + for state_id in state_ids: + futures.append( + range_and_transform_remote.remote( + ranged_detections_cartesian_ref, + observations_ref, + ephemeris_ref, + state_id, + ) + ) + + while futures: + finished, futures = ray.wait(futures, num_returns=1) + transformed_detection_list.append(ray.get(finished[0])) + + else: + # Get state IDs + state_ids = observations.state_id.unique().sort() + for state_id in state_ids: + mask = pc.equal(state_id, observations.state_id) + transformed_detection_list.append( + range_and_transform_worker( + ranged_detections_cartesian.apply_mask(mask), + observations.select("state_id", state_id), + ephemeris.select("id", state_id), + state_id, + ) + ) + + transformed_detections = qv.concatenate(transformed_detection_list) + transformed_detections = transformed_detections.sort_by(by=["state_id"]) + + else: + transformed_detections = TransformedDetections.empty() + + time_end = time.perf_counter() + logger.info(f"Transformed {len(transformed_detections)} observations.") + logger.info( + f"Range and transform completed in {time_end - time_start:.3f} seconds." 
+ ) + return transformed_detections diff --git a/thor/tests/test_main.py b/thor/tests/test_main.py index f0d08125..2feea461 100644 --- a/thor/tests/test_main.py +++ b/thor/tests/test_main.py @@ -3,10 +3,11 @@ from adam_core.utils.helpers import make_observations, make_real_orbits from ..config import Config -from ..main_2 import link_test_orbit, range_and_transform +from ..main import link_test_orbit from ..observations import Observations from ..observations.filters import TestOrbitRadiusObservationFilter from ..orbit import TestOrbit as THORbit +from ..range_and_transform import range_and_transform OBJECT_IDS = [ "594913 'Aylo'chaxnim (2020 AV2)", @@ -243,7 +244,7 @@ def test_link_test_orbit( assert len(recovered_orbit_members) == len(obs_ids_expected) # Ensure we get all the object IDs back that we expect - obs_ids_actual = recovered_orbit_members["obs_id"].values + obs_ids_actual = recovered_orbit_members.obs_id assert pc.all(pc.equal(obs_ids_actual, obs_ids_expected)) @@ -270,5 +271,5 @@ def test_benchmark_link_test_orbit( assert len(recovered_orbit_members) == len(obs_ids_expected) # Ensure we get all the object IDs back that we expect - obs_ids_actual = recovered_orbit_members["obs_id"].values + obs_ids_actual = recovered_orbit_members.obs_id assert pc.all(pc.equal(obs_ids_actual, obs_ids_expected)) diff --git a/thor/utils/__init__.py b/thor/utils/__init__.py index e5f0e3a9..747ff773 100644 --- a/thor/utils/__init__.py +++ b/thor/utils/__init__.py @@ -1,7 +1,3 @@ from .ades import * -from .astropy import * -from .linkages import * from .logging import * -from .multiprocessing import * -from .observations import * -from .patches import * +from .quivr import * diff --git a/thor/utils/astropy.py b/thor/utils/astropy.py deleted file mode 100644 index 3698083c..00000000 --- a/thor/utils/astropy.py +++ /dev/null @@ -1,33 +0,0 @@ -from astropy.time import Time - -__all__ = ["_checkTime"] - - -def _checkTime(time, arg_name): - """ - Check that 'time' is an astropy time object, if not, raise an error. - - Parameters - ---------- - time : `~astropy.time.core.Time` - - arg_name : str - Name of argument in function. - - Returns - ------- - None - - Raises - ------ - ValueError : If time is not an astropy time object. 
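With the astropy Time guard above removed, times travel as adam_core Timestamp objects and column-level comparisons in the updated tests go through pyarrow compute. A short sketch of both, restricted to the calls already used elsewhere in this diff (Timestamp.from_mjd, .mjd(), .to_numpy(zero_copy_only=False), pc.equal / pc.all):

    import pyarrow.compute as pc
    from adam_core.time import Timestamp

    # Build a UTC timestamp column from MJDs and pull it back out as floats
    times = Timestamp.from_mjd([59001.1, 59002.1], scale="utc")
    mjds = times.mjd().to_numpy(zero_copy_only=False)

    # Column-level equality check, as in the updated test assertions
    assert pc.all(pc.equal(times.mjd(), times.mjd())).as_py()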
- """ - err = ( - "Time ({}) has to be an `~astropy.time.core.Time` object.\n" - "Convert using:\n\n" - "from astropy.time import Time\n" - "times = Time(t_array, scale='...', format='...')" - ) - if type(time) != Time: - raise TypeError(err.format(arg_name)) - return diff --git a/thor/utils/linkages.py b/thor/utils/linkages.py deleted file mode 100644 index 029c810a..00000000 --- a/thor/utils/linkages.py +++ /dev/null @@ -1,387 +0,0 @@ -import logging -import time - -import numpy as np -import pandas as pd - -__all__ = [ - "generateCombinations", - "sortLinkages", - "identifySubsetLinkages", - "removeDuplicateLinkages", - "removeDuplicateObservations", - "calcDeltas", -] - -logger = logging.getLogger(__name__) - - -def generateCombinations(x, idx=None, ct=None, reps=None): - # Magic from the wizard himself: Mario Juric - # recursively generate all combinations of idx, assuming - # ct is the list of repeat counts of idx - if x is not None: - # initialization; find indices of where the repetitions are - _, idx, ct = np.unique(x, return_counts=True, return_index=True) - reps = np.nonzero(ct > 1)[0] - if len(reps) == 0: - yield idx - return - i = reps[0] - idx = idx.copy() - for _ in range(ct[i]): - yield from generateCombinations(None, idx, ct, reps[1:]) - idx[i] += 1 - - -def sortLinkages(linkages, linkage_members, observations, linkage_id_col="orbit_id"): - """ - Check that linkages and linkage_members have their linkage IDs in the same order. If not, - sort both by linkage ID. Second, check that linkage_members is additionally sorted by - mjd_utc. If linkage_members does not contain the mjd_utc column, then observations will be merged - to retrieve the observation time. - - Parameters - ---------- - linkages : `~pandas.DataFrame` - DataFrame containing at least a linkage ID column (linkage_id_col). Each unique linkage should - be only present once. - linkage_members : `~pandas.DataFrame` - DataFrame containing at least a linkage ID column (linkage_id_col) and an observation ID column ('obs_id'). The observation ID - column is used to merge on observations so that the observation time can be retrieved. - observations : `~pandas.DataFrame` - DataFrame containing observations with at least an observation ID column ('obs_id') and a observation time - column ('mjd_utc'). - linkage_id_col : str - Name of the linkage ID column. - - Returns - ------- - linkages : `~pandas.DataFrame` - Linkages sorted by linkage IDs. - linkage_members : `~pandas.DataFrame` - Linkages sorted by linkage IDs and observation times. - """ - time_start = time.time() - logger.debug("Verifying linkages...") - - linkages_verified = linkages.copy() - linkage_members_verified = linkage_members.copy() - - reset_index = False - id_sorted = np.all( - linkages_verified[linkage_id_col].values - == linkage_members_verified[linkage_id_col].unique() - ) - if not id_sorted: - logger.debug( - f"Linkages and linkage_members dataframes are not equally sorted by {linkage_id_col}. Sorting..." - ) - # Sort by linkage_id - sort_start = time.time() - linkages_verified.sort_values(by=[linkage_id_col], inplace=True) - linkage_members_verified.sort_values(by=[linkage_id_col], inplace=True) - sort_end = time.time() - duration = sort_end - sort_start - logger.debug(f"Sorting completed in {duration:.3f}s.") - reset_index = True - - time_present = True - if "mjd_utc" not in linkage_members_verified.columns: - logger.debug( - "Observation time column ('mjd_utc') is not in linkage_members, merging with observations..." 
- ) - - # Merge with observations to get the observation time for each observation in linkage_members - merge_start = time.time() - linkage_members_verified = linkage_members_verified.merge( - observations[["obs_id", "mjd_utc"]], on="obs_id", how="left" - ) - merge_end = time.time() - duration = merge_end - merge_start - logger.debug(f"Merging completed in {duration:.3f}s.") - time_present = False - - linkage_members_verified_ = linkage_members_verified.sort_values( - by=[linkage_id_col, "mjd_utc"] - ) - time_sorted = np.all( - linkage_members_verified_[[linkage_id_col, "obs_id"]].values - == linkage_members_verified[[linkage_id_col, "obs_id"]].values - ) - if not time_sorted: - logger.debug( - f"Linkage_members is not sorted by {linkage_id_col} and mjd_utc. Sorting..." - ) - - # Sort by linkage_id, mjd_utc, and finally obs_id - sort_start = time.time() - linkage_members_verified.sort_values( - by=[linkage_id_col, "mjd_utc", "obs_id"], inplace=True - ) - sort_end = time.time() - duration = sort_end - sort_start - logger.debug(f"Sorting completed in {duration:.3f}s.") - reset_index = True - - if reset_index: - for df in [linkages_verified, linkage_members_verified]: - df.reset_index(inplace=True, drop=True) - - if not time_present: - linkage_members_verified.drop(columns=["mjd_utc"], inplace=True) - - time_end = time.time() - duration = time_end - time_start - logger.debug(f"Linkages verified in {duration:.3f}s.") - return linkages_verified, linkage_members_verified - - -def identifySubsetLinkages(linkage_members, linkage_id_col="orbit_id"): - """ - Identify subset linkages. A subset is defined as a linkage which contains - the a subset of observation IDs of a larger or equally sized linkaged. - - For example, if a linkage B has constituent observations: obs0001, obs0002, obs0003, obs0004. - Then a linkage A with constituent observations: obs0001, obs0002, obs0003; is a subset of - B. If linkage A and B share exactly the same observations they will be identified as subsets of each - other. - - Parameters - ---------- - linkage_members : `~pandas.DataFrame` - DataFrame containing at least a linkage ID column (linkage_id_col) and an observation ID column ('obs_id'). - linkage_id_col : str - Name of the linkage ID column. - - Returns - ------- - subsets : `~pandas.DataFrame` - DataFrame containing a column with the linkage_id and a second column containing the linkages identified - as subsets. A linkage with multiple subsets will appear once for every subset linkage found. 
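The subset test described above is plain set containment on observation IDs, and the implementation below this docstring applies it pairwise over all linkages. Spelled out on toy IDs in the docstring's own style:

    linkage_dict = {
        "A": {"obs0001", "obs0002", "obs0003"},
        "B": {"obs0001", "obs0002", "obs0003", "obs0004"},
        "C": {"obs0002", "obs0004"},
    }

    subset_dict = {}
    for id_a, obs_a in linkage_dict.items():
        for id_b, obs_b in linkage_dict.items():
            # Linkage B is a subset of linkage A if all of its observations are in A
            if id_a != id_b and obs_b.issubset(obs_a):
                subset_dict.setdefault(id_a, []).append(id_b)

    print(subset_dict)  # {'B': ['A', 'C']}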
- """ - # Create a dictionary keyed on linkage ID with a set of each linkage's observation - # ID as values - time_start = time.time() - linkage_dict = {} - for linkage_id in linkage_members[linkage_id_col].unique(): - obs_ids = linkage_members[linkage_members[linkage_id_col] == linkage_id][ - "obs_id" - ].values - linkage_dict[linkage_id] = set(obs_ids) - time_end = time.time() - duration = time_end - time_start - logger.debug(f"Linkage dictionary created in {duration:.3f}s.") - - time_start = time.time() - subset_dict = {} - for linkage_id_a in linkage_dict.keys(): - # Grab linkage A's observations - obs_ids_a = linkage_dict[linkage_id_a] - - for linkage_id_b in linkage_dict.keys(): - # If linkage A is not linkage B then - # check if linkage B is a subset of linkage A - if linkage_id_b != linkage_id_a: - - # Grab linkage B's observations - obs_ids_b = linkage_dict[linkage_id_b] - if obs_ids_b.issubset(obs_ids_a): - - # Linkage B is a subset of Linkage A, so lets - # add this result to the subset dictionary - if linkage_id_a not in subset_dict.keys(): - subset_dict[linkage_id_a] = [linkage_id_b] - else: - subset_dict[linkage_id_a].append(linkage_id_b) - time_end = time.time() - duration = time_end - time_start - logger.debug(f"Linkage dictionary scanned for subsets in {duration:.3f}s.") - - time_start = time.time() - linkage_ids = [] - subset_linkages = [] - for linkage_id, subset_ids in subset_dict.items(): - subset_linkages += subset_ids - linkage_ids += [linkage_id for i in range(len(subset_ids))] - subsets = pd.DataFrame({"linkage_id": linkage_ids, "subset_ids": subset_linkages}) - time_end = time.time() - duration = time_end - time_start - logger.debug(f"Subset dataframe created in {duration:.3f}s.") - - return subsets - - -def removeDuplicateLinkages(linkages, linkage_members, linkage_id_col="orbit_id"): - """ - Removes linkages that have identical observations as another linkage. Linkage quality is not taken - into account. - - Parameters - ---------- - linkages : `~pandas.DataFrame` - DataFrame containing at least the linkage ID. - linkage_members : `~pandas.DataFrame` - Dataframe containing the linkage ID and the observation ID for each of the linkage's - constituent observations. Each observation ID should be in a single row. - linkage_id_col : str, optional - Linkage ID column name (must be the same in both DataFrames). - - Returns - ------- - linkages : `~pandas.DataFrame` - DataFrame with duplicate linkages removed. - linkage_members : `~pandas.DataFrame` - DataFrame with duplicate linkages removed. - """ - linkages_ = linkages.copy() - linkage_members_ = linkage_members.copy() - - # Expand observation IDs into columns, then remove duplicates using pandas functionality - expanded = ( - linkage_members_[[linkage_id_col, "obs_id"]] - .groupby(by=[linkage_id_col])["obs_id"] - .apply(list) - .to_frame(name="obs_ids") - ) - expanded = expanded["obs_ids"].apply(pd.Series) - linkage_ids = expanded.drop_duplicates().index.values - - linkages_ = linkages_[linkages_[linkage_id_col].isin(linkage_ids)] - linkage_members_ = linkage_members_[ - linkage_members_[linkage_id_col].isin(linkage_ids) - ] - - for df in [linkages_, linkage_members_]: - df.reset_index(inplace=True, drop=True) - - return linkages_, linkage_members_ - - -def removeDuplicateObservations( - linkages, - linkage_members, - min_obs=5, - linkage_id_col="orbit_id", - filter_cols=["num_obs", "arc_length"], - ascending=[False, False], -): - """ - Removes duplicate observations using the filter columns. 
The filter columns are used to sort the linkages - as desired by the user. The first instance of the observation is kept and all other instances are removed. - If any linkage's number of observations drops below min_obs, that linkage is removed. - - Parameters - ---------- - linkages : `~pandas.DataFrame` - DataFrame containing at least the linkage ID. - linkage_members : `~pandas.DataFrame` - Dataframe containing the linkage ID and the observation ID for each of the linkage's - constituent observations. Each observation ID should be in a single row. - min_obs : int, optional - Minimum number of observations for a linkage to be viable. - linkage_id_col : str, optional - Linkage ID column name (must be the same in both DataFrames). - filter_cols : list, optional - List of column names to use to sort the linkages. - ascending : list, optional - Sort the filter_cols in ascending or descending order. - - Returns - ------- - linkages : `~pandas.DataFrame` - DataFrame with duplicate observations removed. - linkage_members : `~pandas.DataFrame` - DataFrame with duplicate observations removed. - """ - linkages_ = linkages.copy() - linkage_members_ = linkage_members.copy() - - # Sort linkages by the desired columns - linkages_.sort_values( - by=filter_cols, ascending=ascending, inplace=True, ignore_index=True - ) - - # Set both dataframe's indices to the linkage ID for - # faster querying - linkages_.set_index(linkage_id_col, inplace=True) - linkage_members_.set_index(linkage_id_col, inplace=True) - - # Sort linkage members the same way as linkages - linkage_members_ = linkage_members_.loc[linkages_.index.values] - linkage_members_.reset_index(inplace=True) - - # Drop all but the first duplicate observation - linkage_members_ = linkage_members_.drop_duplicates(subset=["obs_id"], keep="first") - - # Make sure that the remaining linkages have enough observations (>= min_obs) - linkage_occurences = linkage_members_[linkage_id_col].value_counts() - linkages_to_keep = linkage_occurences.index.values[ - linkage_occurences.values >= min_obs - ] - linkages_ = linkages_[linkages_.index.isin(linkages_to_keep)] - linkage_members_ = linkage_members_[ - linkage_members_[linkage_id_col].isin(linkages_to_keep) - ] - - # Reset indices - linkages_.reset_index(inplace=True) - linkage_members_.reset_index(inplace=True, drop=True) - return linkages_, linkage_members_ - - -def calcDeltas( - linkage_members, - observations=None, - groupby_cols=["orbit_id", "night_id"], - delta_cols=["mjd_utc", "RA_deg", "Dec_deg", "mag"], -): - """ - Calculate deltas for the desired columns. For example, if groupby columns are given to be orbit_id and night id, then - the linkages are grouped first by orbit_id then night_id, and then the difference in quantities are calculated for - each column in delta_cols. This can be used to calculate the nightly time difference in observations per linkage, or the - amount of motion a linkage has between observations, etc... - - Parameters - ---------- - linkage_members : `~pandas.DataFrame` - DataFrame containing at least a linkage ID column (linkage_id_col) and an observation ID column ('obs_id'). The observation ID - column is used to merge on observations so that the columns from the observations dataframe can be retrieved if necessary. - observations : `~pandas.DataFrame` - DataFrame containing observations with at least an observation ID column ('obs_id'). - groupby_cols : list - Columns by which to group the linkages and calculate deltas. 
- delta_cols : list - Columns for which to calculate deltas. - - Returns - ------- - linkage_members : `~pandas.DataFrame` - Copy of the linkage_members dataframe with the delta columns added. - """ - linkage_members_ = linkage_members.copy() - - # Check to see if each column on which a delta should be - # calculated is in linkage_members, if not look for it - # in observations - cols = [] - for col in delta_cols + groupby_cols: - if col not in linkage_members_.columns: - if col not in observations.columns or observations is None: - err = f"{col} could not be found in either linkage_members or observations." - raise ValueError(err) - - cols.append(col) - - if len(cols) > 0: - linkage_members_ = linkage_members_.merge( - observations[["obs_id"] + cols], on="obs_id", how="left" - ) - - nightly = linkage_members_.groupby(by=groupby_cols) - - deltas = nightly[delta_cols].diff() - deltas.rename(columns={c: f"d{c}" for c in delta_cols}, inplace=True) - linkage_members_ = linkage_members_.join(deltas) - - return linkage_members_ diff --git a/thor/utils/logging.py b/thor/utils/logging.py index 9eb99b1f..c78a11af 100644 --- a/thor/utils/logging.py +++ b/thor/utils/logging.py @@ -1,9 +1,8 @@ import logging import os import sys -import time -__all__ = ["setupLogger", "Timer"] +__all__ = ["setupLogger"] logger = logging.getLogger(__name__) @@ -57,72 +56,3 @@ def setupLogger(name, out_dir=None): logger.addHandler(file_handler) return logger - - -class Timer: - def __init__( - self, - file_name=None, - file_dir="/tmp/thor/", - prepend_data=[], - sep=",", - open_kwargs={ - "mode": "a", - "buffering": -1, - "encoding": "utf-8", - }, - ): - """ - Timing context manager that stores timing results and given user - data to a file if desired. - - - Parameters - ---------- - file_name : {str, None} - Name of file including extension but excluding file directory. - file_dir : str - Directory where to save file. Defaults to /tmp/thor/. - prepend_data : list - Additional data that should be prepended to outputs. - sep : str - If prepend_data is not an empty list, this separator will be used to concatenate - data and the time elapsed into a single line. - open_kwargs : dict - Parameters with which to open the file. 
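
# A minimal sketch of the groupby/diff pattern used by the calcDeltas helper
# removed from thor/utils/linkages.py above, on a toy frame (column names mirror
# the defaults; the values are illustrative): deltas are computed within each
# orbit_id, and the first row of each group has no delta.
import pandas as pd

members = pd.DataFrame(
    {
        "orbit_id": ["o1", "o1", "o1", "o2", "o2"],
        "mjd_utc": [59000.0, 59000.1, 59000.3, 59001.0, 59001.5],
    }
)
members["dmjd_utc"] = members.groupby("orbit_id")["mjd_utc"].diff()
print(members)  # dmjd_utc: NaN, ~0.1, ~0.2, NaN, ~0.5
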
- """ - self.time_start = 0 - self.time_end = 0 - self.file_name = file_name - self.file_dir = file_dir - self.file_path = None - self.file = None - self.prepend_data = prepend_data - self.sep = sep - - if isinstance(file_name, str): - if not os.path.isdir(file_dir): - os.makedirs(file_dir, exist_ok=False) - self.file_path = os.path.join(file_dir, file_name) - self.file = open(self.file_path, **open_kwargs) - return - - def __enter__(self): - self.time_start = time.time() - return - - def __exit__(self, type, value, traceback): - self.time_end = time.time() - duration = self.time_end - self.time_start - data = self.prepend_data + [duration] - string = self.sep.join([str(d) for d in data]) - - if self.file is not None: - self.file.writelines([string + "\n"]) - self.file.close() - else: - if len(self.prepend_data) == 0: - string = f"Time elapsed: {duration:.8f}s" - logger.info(string) - - return diff --git a/thor/utils/multiprocessing.py b/thor/utils/multiprocessing.py deleted file mode 100644 index 8058b12e..00000000 --- a/thor/utils/multiprocessing.py +++ /dev/null @@ -1,153 +0,0 @@ -import logging -import multiprocessing as mp -import signal - -import numpy as np -import pandas as pd - -__all__ = ["Timeout", "yieldChunks", "calcChunkSize", "_initWorker", "_checkParallel"] - -logger = logging.getLogger(__name__) - - -class Timeout: - ### Taken from https://stackoverflow.com/a/22348885 - def __init__(self, seconds=30, error_message="Timeout"): - self.seconds = seconds - self.error_message = error_message - - def handle_timeout(self, signum, frame): - raise TimeoutError(self.error_message) - - def __enter__(self): - signal.signal(signal.SIGALRM, self.handle_timeout) - signal.alarm(self.seconds) - - def __exit__(self, type, value, traceback): - signal.alarm(0) - - -def yieldChunks(indexable, chunk_size): - """ - Generator that yields chunks of size chunk_size. - - Parameters - ---------- - indexable : list, `~numpy.ndarray`, `~pandas.DataFrame`, `~pandas.Series` (N) - Indexable object that needs to be divided into - chunks. - chunk_size : int - Size of each chunk. - - Yields - ------ - chunk : indexable (<=chunk_size) - Chunks of indexable - """ - if isinstance(indexable, list) or isinstance(indexable, np.ndarray): - for c in range(0, len(indexable), chunk_size): - yield indexable[c : c + chunk_size] - elif isinstance(indexable, pd.DataFrame) or isinstance(indexable, pd.Series): - for c in range(0, len(indexable), chunk_size): - yield indexable.iloc[c : c + chunk_size] - else: - err = "Indexable should be one of {list, `~numpy.ndarray`, `~pandas.DataFrame`, `~pandas.Series`}" - raise ValueError(err) - - -def calcChunkSize(n, num_workers, max_chunk_size, min_chunk_size=1): - """ - Calculate the optimal chunk size such that each worker gets at - least min_chunk_size chunks but does not get more than max_chunk_size - chunks. The goal is for no worker to be idle in terms of the number of items - it recieves - - Parameters - ---------- - n : int - Number of items. - num_workers : int - Number of workers to which items will be distributed. - max_chunk_size : int - Maximum chunk size to be given to each worker. - min_chunk_size : int, optional - Minimum chunk size to be given to each worker. - - Yields - ------ - chunk_size : int - Chunk_size between min_chunk_size and max_chunk_size. 
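
# A minimal sketch of the fixed-size chunking performed by the removed yieldChunks
# above; the helper name and inputs here are illustrative, and only the plain
# list/array case is shown (the removed function also handled pandas objects).
import numpy as np


def yield_chunks_simple(indexable, chunk_size):
    # Yield successive slices of at most chunk_size elements.
    for start in range(0, len(indexable), chunk_size):
        yield indexable[start : start + chunk_size]


chunks = list(yield_chunks_simple(np.arange(15), 10))
print([len(c) for c in chunks])  # [10, 5]
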
- """ - # Calculate the number of n that should be sent to each worker - c = np.maximum(np.floor(n / num_workers), min_chunk_size).astype(int) - - # Make sure this number does not exceed the maximum chunk size - chunk_size = np.minimum(c, max_chunk_size) - return chunk_size - - -def _initWorker(): - """ - Tell multiprocessing worker to ignore signals, will only - listen to parent process. - """ - signal.signal(signal.SIGINT, signal.SIG_IGN) - return - - -def _checkParallel(num_jobs, parallel_backend): - """ - Helper function to determine how many workers (jobs) should be submitted. - If the parallelization backend is Python's multiprocessing ('mp') and num_jobs - is either "auto" or None, then mp.cpu_count() will be used to determine the number - of jobs. If num_jobs is not "auto" or None, then that number will be used instead. - If the parallelization backend is ray, then the number of resources on the cluster will - determine the number of workers, and num_jobs is ignored. - - Parameters - ---------- - num_jobs : {None, "auto", int} - Number of jobs to launch. - parallel_backend : str - Name of backend. Should be one of {'ray', 'mp', 'cf'}. - - Returns - ------- - enable_parallel : bool - True if parallelization should be used (true for cases where num_jobs > 1). - num_workers : int - The number of workers to use. - - Raises - ------ - ValueError : If parallel_backend is not one of {'ray', 'mp', 'cf'}. - """ - if isinstance(num_jobs, str) or (num_jobs is None) or (num_jobs > 1): - - # Check that pareallel_backend is one of the support types - backends = ["ray", "mp", "cf"] - if parallel_backend not in backends: - err = "parallel_backend should be one of {'ray', 'mp', 'cf'}" - raise ValueError(err) - - enable_parallel = True - if parallel_backend == "ray": - import ray - - if not num_jobs == "auto" or num_jobs is not None: - logger.warning( - "This process is running with the ray parallelization backend: num_jobs parameter will be ignored." - ) - - num_workers = int(ray.cluster_resources()["CPU"]) - - else: - if num_jobs == "auto" or num_jobs is None: - num_workers = mp.cpu_count() - else: - num_workers = num_jobs - else: - num_workers = 1 - enable_parallel = False - - return enable_parallel, num_workers diff --git a/thor/utils/observations.py b/thor/utils/observations.py deleted file mode 100644 index 1c8ab1bf..00000000 --- a/thor/utils/observations.py +++ /dev/null @@ -1,74 +0,0 @@ -import numpy as np -from astropy.time import Time - -from .astropy import _checkTime - -__all__ = ["calcLinkingWindow", "calcNight"] - - -def calcLinkingWindow(times: Time, length: float) -> np.ndarray: - """ - Calculate the linking window for each given time. Linking windows are counted - from the earliest time. For example, if given the following times and a defined - window size of 5 days then the times will be assigned as follows: - Time: Linking Window - 2021-08-08T12:00:00.000 1 - 2021-08-09T12:00:00.000 1 - 2021-08-10T12:00:00.000 1 - 2021-08-11T12:00:00.000 1 - 2021-08-12T12:00:00.000 1 - 2021-08-13T12:00:00.000 2 - 2021-08-14T12:00:00.000 2 - 2021-08-15T12:00:00.000 2 - 2021-08-16T12:00:00.000 2 - 2021-08-17T12:00:00.000 2 - - Parameters - ---------- - times : `~astropy.time.core.Time` - Observation times that need to be assigned to a linking window. - length : float, int - Length of the desired linking window in units of decimal days. - - Returns - ------- - window : `~numpy.ndarray` (N) - The linking window number to which each time was assigned. 
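
# A minimal sketch of the arithmetic behind calcLinkingWindow above, using plain
# MJD floats instead of an astropy Time object (the times below are illustrative):
# window = floor((mjd - mjd_min) / length) + 1, counted from the earliest time.
import numpy as np

mjd = np.array([59434.5, 59435.5, 59438.5, 59439.5, 59443.5])
length = 5.0  # linking window length in days
window = (np.floor((mjd - mjd.min()) / length) + 1).astype(int)
print(window)  # [1 1 1 2 2]
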
- """ - _checkTime(times, "times") - dt = times.mjd - times.mjd.min() - window = np.floor(dt / length) + 1 - return window.astype(int) - - -def calcNight(times: Time, local_midnight: float) -> np.ndarray: - """ - Given a set of observation times, calculates the night - on which those observations were made. - - Parameters - ---------- - times : `~astropy.core.Time` (N) - Observation times. - local_midnight : float - The change in time from UTC at which local midnight occurs for the observer - or observatory that made the observations. This should be a float value in units - of decimal days. For those locations East of the Prime Meridian these - would be positive values (e.g., Brussels would be +1/24, Shanghai would be +8/24). - For locations West of the Prime Meridian these values should be negative - (e.g., New York would be -4/24, Seattle would be -7/24). - - Returns - ------- - night : `~numpy.ndarray` (N) - Night on which the observation was made expressed as an integer MJD. - """ - _checkTime(times, "times") - mjd_utc = times.utc.mjd - local_mjd = mjd_utc - local_midnight - # Observations that occur after midnight should be assigned the night value - # of the observations that occurred before midnight, so shift all the observations - # 12 hours earlier and use that as the night value - nights = local_mjd - 12 / 24 - nights = nights.astype(int) - return nights diff --git a/thor/utils/patches.py b/thor/utils/patches.py deleted file mode 100644 index fab9444f..00000000 --- a/thor/utils/patches.py +++ /dev/null @@ -1,91 +0,0 @@ -import healpy as hp -import numpy as np - -__all__ = ["assignPatchesSquare", "assignPatchesHEALPix"] - - -def assignPatchesSquare( - ra: np.ndarray, - dec: np.ndarray, - ra_width: float = 15.0, - dec_width: float = 15.0, -) -> np.ndarray: - """ - Assign a patch ID to each coordinate where a patch is a square region - on the sky plane of ra_width in RA and of dec_width in Dec. - - RA must be between 0 and 360 degrees. - Dec must be between -90 and 90 degrees. - - Patches are created first in increasing Declination and second in increasing - Right Ascension. The first patch will border RA of 0 and Dec of -90, while - the last patch will border RA of 360 and Dec of 90. - - Parameters - ---------- - ra : `~numpy.ndarray` (N) - Right Ascension in degrees. - dec : `~numpy.ndarray` (N) - Declination in degrees. - ra_width : float - Width of patch in RA in degrees. - dec_with : float - Width of patch in Dec in degrees. - - Returns - ------- - patch_ids : `~numpy.ndarray` (N) - The patch ID for each coordinate. - """ - ras = np.arange(0, 360 + ra_width, ra_width) - decs = np.arange(-90, 90 + dec_width, dec_width) - - patch_id = 0 - patch_ids = -1 * np.ones(len(ra), dtype=int) - for ra_i, ra_f in zip(ras[:-1], ras[1:]): - for dec_i, dec_f in zip(decs[:-1], decs[1:]): - - mask = np.where( - ((ra >= ra_i) & (ra < ra_f) & (dec >= dec_i) & (dec < dec_f)) - ) - patch_ids[mask] = patch_id - - patch_id += 1 - - return patch_ids - - -def assignPatchesHEALPix( - ra: np.ndarray, - dec: np.ndarray, - nside: int = 1024, - nest: bool = True, - lonlat: bool = True, -) -> np.ndarray: - """ - Assign patches using a HEALPix schema. - For details see Górski et al. (2005). - - Parameters - ---------- - ra : `~numpy.ndarray` (N) - Right Ascension in degrees. - dec : `~numpy.ndarray` (N) - Declination in degrees. - nside : int - HEALPix nside parameter (must be a power of 2). - - Returns - ------- - patch_ids : `~numpy.ndarray` (N) - The patch ID for each coordinate. 
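
# A minimal sketch of the night assignment performed by calcNight above, using
# plain MJD floats instead of an astropy Time object. The offset mimics the
# Seattle value from the docstring (-7/24); the observation times are illustrative.
import numpy as np

local_midnight = -7 / 24
# Two observations taken 5 hours before and 5 hours after local midnight of MJD 59001
mjd_utc = 59001.0 + np.array([-5 / 24, 5 / 24]) + local_midnight
night = (mjd_utc - local_midnight - 12 / 24).astype(int)
print(night)  # [59000 59000] : both belong to the same observing night
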
- - References - ---------- - [1] Górski, K. M., Hivon, E., Banday, A. J., Wandelt, B. D., Hansen, F. K., Reinecke, M., & Bartelmann, M. (2005). - HEALPix: A Framework for High-Resolution Discretization and Fast Analysis of Data Distributed on the Sphere. - The Astrophysical Journal, 622(2), 759. https://doi.org/10.1086/427976 - """ - patch_ids = hp.ang2pix(nside, ra, dec, nest=nest, lonlat=lonlat) - - return patch_ids diff --git a/thor/utils/quivr.py b/thor/utils/quivr.py new file mode 100644 index 00000000..aaa705db --- /dev/null +++ b/thor/utils/quivr.py @@ -0,0 +1,62 @@ +from typing import List, Literal, Optional + +import numpy as np +import pyarrow as pa +import quivr as qv + +__all__ = [ + "drop_duplicates", +] + + +def drop_duplicates( + table: qv.AnyTable, + subset: Optional[List[str]] = None, + keep: Literal["first", "last"] = "first", +) -> qv.AnyTable: + """ + Drop duplicate rows from a `~quivr.Table`. This function is similar to + `~pandas.DataFrame.drop_duplicates` but it supports nested columns (representing + nested tables). + + Parameters + ---------- + table : `~quivr.Table` + Table to drop duplicate rows from. + subset : list of str, optional + Subset of columns to consider when dropping duplicates. If not specified then + all columns are used. + keep : {'first', 'last'}, default 'first' + If there are duplicate rows then keep the first or last row. + + Returns + ------- + table : `~quivr.Table` + Table with duplicate rows removed. + """ + # Flatten the table so nested columns are dot-delimited at the top level + flattened_table = table.flattened_table() + + # If subset is not specified then use all the columns + if subset is None: + subset = [c for c in flattened_table.column_names] + + # Add an index column to the flattened table + flattened_table = flattened_table.add_column( + 0, "index", pa.array(np.arange(len(flattened_table))) + ) + + if keep == "first": + agg_func = keep + elif keep == "last": + agg_func = keep + else: + raise ValueError("keep must be either 'first' or 'last'") + indices = ( + flattened_table.group_by(subset, use_threads=False) + .aggregate([("index", agg_func)]) + .column(f"index_{agg_func}") + ) + + # Take the indices from the flattened table and use them to index into the original table + return table.take(indices) diff --git a/thor/utils/tests/__init__.py b/thor/utils/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/thor/utils/tests/test_astropy.py b/thor/utils/tests/test_astropy.py deleted file mode 100644 index 0727fe8f..00000000 --- a/thor/utils/tests/test_astropy.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np -import pytest -from astropy.time import Time - -from ..astropy import _checkTime - - -def test__checkTime(): - # Create an array of epochs - times = np.linspace(59580, 59590, 100) - - # Test that an error is raised when times are not - # an astropy time object - with pytest.raises(TypeError): - _checkTime(times, "test") - - # Test that _checkTime passes when an astropy time object is - # given as intended - times_astropy = Time(times, format="mjd", scale="utc") - - return diff --git a/thor/utils/tests/test_linkages.py b/thor/utils/tests/test_linkages.py deleted file mode 100644 index e313969a..00000000 --- a/thor/utils/tests/test_linkages.py +++ /dev/null @@ -1,227 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -from ..linkages import calcDeltas, identifySubsetLinkages, sortLinkages - -### Create test data set -linkage_ids = ["a", "b", "c"] -linkage_lengths = [4, 5, 6] - 
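
# A minimal sketch of the pyarrow pattern that the new drop_duplicates utility
# above relies on: tag each row with its position, group on the key columns, keep
# the "first" index per group, and take those rows. The toy table and column names
# below are illustrative, not THOR data; the real helper operates on a flattened
# quivr table rather than a plain pyarrow table.
import numpy as np
import pyarrow as pa

table = pa.table({"obs_id": ["o1", "o2", "o2", "o3"], "mag": [20.1, 20.5, 20.5, 21.0]})
indexed = table.add_column(0, "index", pa.array(np.arange(len(table))))
keep = (
    indexed.group_by(["obs_id", "mag"], use_threads=False)
    .aggregate([("index", "first")])
    .column("index_first")
)
print(table.take(keep).num_rows)  # 3 : the repeated ("o2", 20.5) row is dropped
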
-linkage_members_ids = [] -for i, lid in enumerate(linkage_ids): - linkage_members_ids += [lid for j in range(linkage_lengths[i])] -obs_ids = [f"o{i:04d}" for i in range(len(linkage_members_ids))] - -times = [] -for i in range(len(linkage_ids)): - times += [np.arange(59000, 59000 + linkage_lengths[i])] -times = np.concatenate(times) - -LINKAGES = pd.DataFrame({"linkage_id": linkage_ids}) -LINKAGE_MEMBERS = pd.DataFrame( - { - "linkage_id": linkage_members_ids, - "obs_id": obs_ids, - "mjd_utc": times, - } -) -OBSERVATIONS = pd.DataFrame( - { - "obs_id": obs_ids, - "mjd_utc": times, - } -) -OBSERVATIONS.sort_values(by=["mjd_utc", "obs_id"], inplace=True, ignore_index=True) - - -def test_sortLinkages_timePresent(): - np.random.seed(42) - - # Scramble the linkages dataframe - len_linkages = len(LINKAGES) - scramble = np.random.choice(len_linkages, len_linkages, replace=False) - linkages_unsorted = LINKAGES.loc[scramble].reset_index(drop=True) - - # Scramble the linkage_members dataframe - len_members = len(LINKAGE_MEMBERS) - scramble = np.random.choice(len_members, len_members, replace=False) - linkage_members_unsorted = LINKAGE_MEMBERS.loc[scramble].reset_index(drop=True) - - # Sort scrambled linkages - linkages_sorted, linkage_members_sorted = sortLinkages( - linkages_unsorted, - linkage_members_unsorted, - OBSERVATIONS, - linkage_id_col="linkage_id", - ) - - # Make sure they returned dataframes match those created - pd.testing.assert_frame_equal(LINKAGES, linkages_sorted) - pd.testing.assert_frame_equal(LINKAGE_MEMBERS, linkage_members_sorted) - - -def test_sortLinkages_timeMissing(): - np.random.seed(42) - - # Scramble the linkages dataframe - len_linkages = len(LINKAGES) - scramble = np.random.choice(len_linkages, len_linkages, replace=False) - linkages_unsorted = LINKAGES.loc[scramble].reset_index(drop=True) - - # Scramble the linkage_members dataframe - len_members = len(LINKAGE_MEMBERS) - scramble = np.random.choice(len_members, len_members, replace=False) - linkage_members_unsorted = LINKAGE_MEMBERS.loc[scramble].reset_index(drop=True) - linkage_members_unsorted.drop(columns=["mjd_utc"], inplace=True) - - # Sort scrambled linkages - linkages_sorted, linkage_members_sorted = sortLinkages( - linkages_unsorted, - linkage_members_unsorted, - OBSERVATIONS, - linkage_id_col="linkage_id", - ) - - # Make sure they returned dataframes match those created - pd.testing.assert_frame_equal(LINKAGES, linkages_sorted) - pd.testing.assert_frame_equal( - LINKAGE_MEMBERS[["linkage_id", "obs_id"]], linkage_members_sorted - ) - - -def test_calcDeltas(): - - # Calculate deltas for the time column and make sure they match the input dataset - linkages_members_ = calcDeltas( - LINKAGE_MEMBERS, - OBSERVATIONS, - groupby_cols=["linkage_id"], - delta_cols=["mjd_utc"], - ) - - assert "dmjd_utc" in linkages_members_.columns - assert ( - linkages_members_[linkages_members_["linkage_id"] == "a"]["dmjd_utc"].sum() == 3 - ) - assert ( - linkages_members_[linkages_members_["linkage_id"] == "b"]["dmjd_utc"].sum() == 4 - ) - assert ( - linkages_members_[linkages_members_["linkage_id"] == "c"]["dmjd_utc"].sum() == 5 - ) - - -def test_calcDeltas_columnInObservations(): - - # Calculate deltas for the time column and make sure they match the input dataset, but - # this time remove the column from linkage_members so that calcDeltas looks for it - # in observations - linkages_members_ = calcDeltas( - LINKAGE_MEMBERS[["linkage_id", "obs_id"]], - OBSERVATIONS, - groupby_cols=["linkage_id"], - delta_cols=["mjd_utc"], - ) - - 
assert "dmjd_utc" in linkages_members_.columns - assert ( - linkages_members_[linkages_members_["linkage_id"] == "a"]["dmjd_utc"].sum() == 3 - ) - assert ( - linkages_members_[linkages_members_["linkage_id"] == "b"]["dmjd_utc"].sum() == 4 - ) - assert ( - linkages_members_[linkages_members_["linkage_id"] == "c"]["dmjd_utc"].sum() == 5 - ) - - -def test_calcDeltas_missingColumn(): - - # Calculate deltas for the time column and make sure they match the input dataset - with pytest.raises(ValueError): - linkages_members_ = calcDeltas( - LINKAGE_MEMBERS[["linkage_id", "obs_id"]], - OBSERVATIONS[["obs_id"]], - groupby_cols=["linkage_id"], - delta_cols=["mjd_utc"], - ) - - -def test_identifySubsetLinkages_0subsets(): - - # No subsets should be found in the default dataset - subsets = identifySubsetLinkages(LINKAGE_MEMBERS, linkage_id_col="linkage_id") - assert len(subsets) == 0 - - return - - -def test_identifySubsetLinkages_3subsets(): - - # Make a copy of the linkage members dataframe - # Rename the observations so that A is a subset of B and C, and so that B is a subset of C. - linkage_members = LINKAGE_MEMBERS.copy() - for linkage_id in LINKAGE_MEMBERS["linkage_id"].unique(): - num_obs = len(linkage_members[linkage_members["linkage_id"].isin([linkage_id])]) - linkage_members.loc[ - linkage_members["linkage_id"].isin([linkage_id]), "obs_id" - ] = [f"o{i:04d}" for i in range(num_obs)] - - subsets = identifySubsetLinkages(linkage_members, linkage_id_col="linkage_id") - - # Make sure A and B have been identified as subsets of C - C_subsets = subsets[subsets["linkage_id"] == "c"]["subset_ids"].values - assert "a" in C_subsets - assert "b" in C_subsets - assert len(C_subsets) == 2 - - # Make sure A has been identifed as a subsets of B - B_subsets = subsets[subsets["linkage_id"] == "b"]["subset_ids"].values - assert "a" in B_subsets - assert len(B_subsets) == 1 - - # Make sure A has no subsets - A_subsets = subsets[subsets["linkage_id"] == "a"]["subset_ids"].values - assert len(A_subsets) == 0 - - return - - -def test_identifySubsetLinkages_3duplicates(): - - # Make a copy of the linkage members dataframe - # Rename the observations so that A is a subset of B and C, and so that B is a subset of C. 
- linkage_members = LINKAGE_MEMBERS.copy() - for linkage_id in LINKAGE_MEMBERS["linkage_id"].unique(): - num_obs = len(linkage_members[linkage_members["linkage_id"].isin([linkage_id])]) - linkage_members.loc[ - linkage_members["linkage_id"].isin([linkage_id]), "obs_id" - ] = [f"o{i:04d}" for i in range(num_obs)] - - # Trim the linkages so that they have exactly the same observations - linkage_members = linkage_members[ - ~linkage_members["obs_id"].isin(["o0004", "o0005"]) - ].copy() - - subsets = identifySubsetLinkages(linkage_members, linkage_id_col="linkage_id") - - # Make sure A and B have been identified as subsets of C - C_subsets = subsets[subsets["linkage_id"] == "c"]["subset_ids"].values - assert "a" in C_subsets - assert "b" in C_subsets - assert len(C_subsets) == 2 - - # Make sure A and C have been identified as subsets of B - B_subsets = subsets[subsets["linkage_id"] == "b"]["subset_ids"].values - assert "a" in B_subsets - assert "c" in B_subsets - assert len(B_subsets) == 2 - - # Make sure B and C have been identified as subsets of C - C_subsets = subsets[subsets["linkage_id"] == "c"]["subset_ids"].values - assert "a" in C_subsets - assert "b" in C_subsets - assert len(C_subsets) == 2 - - return diff --git a/thor/utils/tests/test_multiprocessing.py b/thor/utils/tests/test_multiprocessing.py deleted file mode 100644 index fd7cb6e5..00000000 --- a/thor/utils/tests/test_multiprocessing.py +++ /dev/null @@ -1,372 +0,0 @@ -import numpy as np -import pandas as pd -import pytest - -from ..multiprocessing import calcChunkSize, yieldChunks - - -def test_yieldChunks_list(): - # Create list of data - indexable = [i for i in range(15)] - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - np.testing.assert_array_equal(np.array(chunk), desired) - return - - -def test_yieldChunks_array(): - # Create list of data - indexable = np.arange(0, 15, 1) - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(chunk, desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk, desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration 
should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(chunk, desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(chunk, desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk, desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - np.testing.assert_array_equal(chunk, desired) - return - - -def test_yieldChunks_series(): - # Create series of data - indexable = pd.Series(np.arange(0, 15, 1)) - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - np.testing.assert_array_equal(chunk.values, desired) - return - - -def test_yieldChunks_series_offsetIndex(): - # Create series of data - indexable = pd.Series(np.arange(0, 15, 1)) - # Offset the index and make sure chunking is done independent of - # the values of the index - indexable.index = np.arange(0, 150, 10) - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk.values, desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - 
np.testing.assert_array_equal(chunk.values, desired) - return - - -def test_yieldChunks_dataframe_offsetIndex(): - # Create dataframe of data - data = {"x": np.arange(0, 15, 1)} - indexable = pd.DataFrame(data) - # Offset the index and make sure chunking is done independent of - # the values of the index - indexable.index = np.arange(0, 150, 10) - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - return - - -def test_yieldChunks_dataframe(): - # Create dataframe of data - data = {"x": np.arange(0, 15, 1)} - indexable = pd.DataFrame(data) - - # Set chunk_size to 10 - chunk_size = 10 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 9 - chunk = next(generator) - desired = np.arange(0, 10, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Second iteration should yield 10 through 15 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Set chunk_size to 5 - chunk_size = 5 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 4 - chunk = next(generator) - desired = np.arange(0, 5, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Second iteration should yield 5 through 9 - chunk = next(generator) - desired = np.arange(5, 10, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Third iteration should yield 10 through 14 - chunk = next(generator) - desired = np.arange(10, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - - # Set chunk_size to 20 - chunk_size = 20 - generator = yieldChunks(indexable, chunk_size) - - # First iteration should yield 0 through 14 - chunk = next(generator) - desired = np.arange(0, 15, 1) - np.testing.assert_array_equal(chunk["x"].values, desired) - return - - -def test_yieldChunks_errors(): - - # Make sure yieldChunks raises an error for unsupported types - chunk_size = 1 - with pytest.raises(ValueError): - generator = yieldChunks(set(), chunk_size) - next(generator) - - return - - -def test_calcChunkSize(): - - num_workers = 60 - n = 100 - max_chunk_size = 10 - min_chunk_size = 1 - - # Number of workers is less than 2x the number of things to process, so - # chunk size should fall to the minimum value - chunk_size = calcChunkSize( - n, 
num_workers, max_chunk_size, min_chunk_size=min_chunk_size - ) - assert chunk_size == 1 - - min_chunk_size = 5 - # Number of workers is less than 2x the number of things to process, so - # chunk size should fall to the minimum value - chunk_size = calcChunkSize( - n, num_workers, max_chunk_size, min_chunk_size=min_chunk_size - ) - assert chunk_size == 5 - - num_workers = 10 - n = 1000 - max_chunk_size = 10 - min_chunk_size = 1 - - # Number of things to process is 10x the number of workers, the max chunk size is 10 so - # the chunk_size should be 10 - chunk_size = calcChunkSize( - n, num_workers, max_chunk_size, min_chunk_size=min_chunk_size - ) - assert chunk_size == 10 - - num_workers = 10 - n = 1000 - max_chunk_size = 5 - min_chunk_size = 1 - - # Number of things to process is 10x the number of workers, the max chunk size is 5 so - # the chunk_size should be 5 - chunk_size = calcChunkSize( - n, num_workers, max_chunk_size, min_chunk_size=min_chunk_size - ) - assert chunk_size == 5 - - num_workers = 100 - n = 10000 - max_chunk_size = 1000 - min_chunk_size = 1 - - # Number of things to process is 10x the number of workers, the max chunk size is 1000 so - # the chunk_size should be 10000/100 = 100 - chunk_size = calcChunkSize( - n, num_workers, max_chunk_size, min_chunk_size=min_chunk_size - ) - assert chunk_size == 100 - return diff --git a/thor/utils/tests/test_observations.py b/thor/utils/tests/test_observations.py deleted file mode 100644 index b9a3e63e..00000000 --- a/thor/utils/tests/test_observations.py +++ /dev/null @@ -1,108 +0,0 @@ -import numpy as np -import pytest -from astropy.time import Time - -from ..observations import calcLinkingWindow, calcNight - - -def test_calcLinkingWindow(): - - # Times are all in August 2021 - time = Time([59426 + 1 / 24], scale="utc", format="mjd") - times = time + np.arange(0, 30) - - # If the linking window exceeds the range of the observations - # then they all should be in the first linking window - length = 30 - linking_window = calcLinkingWindow(times, length) - np.testing.assert_equal(linking_window, np.ones(length, dtype=int)) - - # If the linking window is 15 days then we expect 15 observations - # in linking window 1 and 15 in window 2 - length = 15 - linking_window = calcLinkingWindow(times, length) - np.testing.assert_equal(linking_window[0:15], np.ones(length, dtype=int)) - np.testing.assert_equal(linking_window[15:30], 2 * np.ones(length, dtype=int)) - - # If the linking window is 10 days then we expect 10 observations - # in linking window 1 and 10 in window 2 and 10 in window 3 - length = 10 - linking_window = calcLinkingWindow(times, length) - np.testing.assert_equal(linking_window[0:10], np.ones(length, dtype=int)) - np.testing.assert_equal(linking_window[10:20], 2 * np.ones(length, dtype=int)) - np.testing.assert_equal(linking_window[20:30], 3 * np.ones(length, dtype=int)) - - return - - -def test_calcLinkingWindow_raises(): - - # Times are all in August 2021 - time = Time([59426 + 1 / 24], scale="utc", format="mjd") - times = time + np.arange(0, 30) - - # Passing a plain array should raise a TypeError - with pytest.raises(TypeError): - linking_window = calcLinkingWindow(times.mjd, 10) - - return - - -def test_calcNight_london(): - # Lets assume three days of observations were made - # somewhere in London with observations occuring - # +- 5 hours around midnight - dts = np.linspace(-5, 5, 20) - dts /= 24.0 - offset = 0 / 24 - days = [59001, 59002, 59003] - times = np.zeros(len(dts) * len(days)) - for i, day in enumerate(days): - 
times[i * len(dts) : (i + 1) * len(dts)] = day + dts + offset - observation_times = Time(times, scale="utc", format="mjd") - - night = calcNight(observation_times, offset) - assert np.all(night[0:20] == 59000) - assert np.all(night[20:40] == 59001) - assert np.all(night[40:60] == 59002) - return - - -def test_calcNight_seattle(): - # Lets assume three days of observations were made - # somewhere in Seattle with observations occuring - # +- 5 hours around midnight - dts = np.linspace(-5, 5, 20) - dts /= 24.0 - offset = -7 / 24 - days = [59001, 59002, 59003] - times = np.zeros(len(dts) * len(days)) - for i, day in enumerate(days): - times[i * len(dts) : (i + 1) * len(dts)] = day + dts + offset - observation_times = Time(times, scale="utc", format="mjd") - - night = calcNight(observation_times, offset) - assert np.all(night[0:20] == 59000) - assert np.all(night[20:40] == 59001) - assert np.all(night[40:60] == 59002) - return - - -def test_calcNight_shanghai(): - # Lets assume three days of observations were made - # somewhere in Shanghai with observations occuring - # +- 5 hours around midnight - dts = np.linspace(-5, 5, 20) - dts /= 24.0 - offset = 8 / 24 - days = [59001, 59002, 59003] - times = np.zeros(len(dts) * len(days)) - for i, day in enumerate(days): - times[i * len(dts) : (i + 1) * len(dts)] = day + dts + offset - observation_times = Time(times, scale="utc", format="mjd") - - night = calcNight(observation_times, offset) - assert np.all(night[0:20] == 59000) - assert np.all(night[20:40] == 59001) - assert np.all(night[40:60] == 59002) - return diff --git a/thor/utils/tests/test_patches.py b/thor/utils/tests/test_patches.py deleted file mode 100644 index 74553388..00000000 --- a/thor/utils/tests/test_patches.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np - -from ..patches import assignPatchesHEALPix, assignPatchesSquare - - -def test_assignPatchesSquare(): - - ra_width = 15 - dec_width = 15 - num_ra = 360 / ra_width - num_dec = 180 / dec_width - - # Create fake observations at the midpoint of each bin - ra = np.arange(0.0, 360, ra_width) - ra += ra_width / 2 - dec = np.arange(-90, 90.0, dec_width) - dec += dec_width / 2 - - dec, ra = np.meshgrid(dec, ra) - dec = dec.flatten() - ra = ra.flatten() - patch_ids_desired = np.arange(0, len(dec), dtype=int) - - patch_ids = assignPatchesSquare(ra, dec, ra_width=ra_width, dec_width=dec_width) - - # Check that the patch IDs match the expected order - # (increasing Dec first, then increasing RA) - np.testing.assert_equal(patch_ids_desired, patch_ids) - - -def test_assignPatchesHEALPix(): - - # Create observations - ra = np.arange(0.0, 360, 10) - dec = np.arange(-90, 90.0, 10) - - dec, ra = np.meshgrid(dec, ra) - dec = dec.flatten() - ra = ra.flatten() - nside = 32 - - # Not really testing for anything other than - # that the function call works.. - patch_ids = assignPatchesHEALPix(ra, dec, nside) - assert len(patch_ids) == len(ra)
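
# A minimal sketch of the square sky-patch numbering used by the removed
# assignPatchesSquare (tested above): patches advance first in increasing Dec,
# then in increasing RA. This vectorized form assumes 0 <= RA < 360 and
# -90 <= Dec < 90 degrees; the coordinates below are illustrative.
import numpy as np

ra = np.array([7.5, 187.5, 352.5])   # degrees
dec = np.array([-82.5, 0.5, 82.5])   # degrees
ra_width, dec_width = 15.0, 15.0

num_dec = int(180 / dec_width)
ra_idx = np.floor(ra / ra_width).astype(int)
dec_idx = np.floor((dec + 90.0) / dec_width).astype(int)
patch_ids = ra_idx * num_dec + dec_idx
print(patch_ids)  # [  0 150 287]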