Skip to content

Commit

Permalink
Script cleaning
Browse files (browse the repository at this point in the history)
  • Loading branch information
GraysonBellamy committed Jun 20, 2024
1 parent 3a82d78 commit 7a3e210
Show file tree
Hide file tree
Showing 5 changed files with 273 additions and 189 deletions.
109 changes: 85 additions & 24 deletions src/lab_etl/deatak_cone_parser.py
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
import pyarrow as pa
import pyarrow.parquet as pq
from lab_etl.util import set_metadata
from lab_etl.util import set_metadata, get_hash
import polars as pl
from typing import Any


def load_cone_data(path: str) -> pa.Table:
Expand All @@ -20,22 +21,39 @@ def load_cone_data(path: str) -> pa.Table:
"Ext Coeff": "extinction_coefficient",
"Flame Verif": "flame_verification",
}
df = pl.read_excel(
path, engine="calamine", sheet_id=2, read_options={"skip_rows": 5}
)

# Read Excel data using Polars
try:
df = pl.read_excel(
path, engine="calamine", sheet_id=2, read_options={"skip_rows": 5}
)
except Exception as e:
raise ValueError(f"Error reading Excel file at {path}: {str(e)}")

# Get units and metadata
units = get_cone_units(path)
meta = get_cone_metadata(path)
df = df.drop("Names")

# Drop 'Names' column if it exists
if "Names" in df.columns:
df = df.drop("Names")

# Rename columns based on the mapping
df = df.rename(
{col: mapping.get(col, col).lower().replace(" ", "_") for col in df.columns}
)

# Convert Polars DataFrame to PyArrow Table
table = df.to_arrow()

# Add metadata to the PyArrow Table
table_meta = set_metadata(table, col_meta=units, tbl_meta=meta)

return table_meta


def get_cone_units(path: str):
"""Get the units from a Cone file.
def get_cone_units(path: str) -> dict:
"""Get the units from a Cone file using Polars.
Args:
path (str): The path to the Cone file.
Expand All @@ -51,20 +69,35 @@ def get_cone_units(path: str):
"Ext Coeff": "extinction_coefficient",
"Flame Verif": "flame_verification",
}
units = pl.read_excel(
path, engine="calamine", sheet_id=2, read_options={"n_rows": 1, "skip_rows": 3}
)
units = units.to_dicts()[0]
del units["Names"]
units = {
k_mapping.get(k, k).lower().replace(" ", "_"): {"unit": mapping.get(v, v)}
for k, v in units.items()
if v is not None
}
return units

try:
# Read Excel file using Polars
units = pl.read_excel(
path,
engine="calamine",
sheet_id=2,
read_options={"n_rows": 1, "skip_rows": 3},
)
units_dict = units.to_dicts()[0]
except Exception as e:
raise ValueError(f"Error reading Excel file at {path}: {str(e)}")

# Process units dictionary
if "Names" in units_dict:
del units_dict["Names"]

units_result = {}

for k, v in units_dict.items():
if v is not None:
standardized_key = k_mapping.get(k, k).lower().replace(" ", "_")
unit = mapping.get(v, v)
units_result[standardized_key] = {"unit": unit}

def get_cone_metadata(path: str):
return units_result


def get_cone_metadata(path: str) -> dict:
"""Get the metadata from a Cone file.
Args:
Expand All @@ -80,26 +113,54 @@ def get_cone_metadata(path: str):
"pre_test_cmt": "comment",
"post_test_cmt": "comment",
}
meta = pl.read_excel(
path, engine="calamine", sheet_id=1, read_options={"header_row": None}
)
meta_dict = {}

# Get file metadata
file_hash = get_hash(path)

# Read Excel file using Polars
try:
meta = pl.read_excel(
path, engine="calamine", sheet_id=1, read_options={"header_row": None}
)
except Exception as e:
raise ValueError(f"Error reading Excel file at {path}: {str(e)}")

meta_dict: dict[str, Any] = {}

# Process each row in the DataFrame
for row in meta.iter_rows():
if len(row) < 2:
continue

key = row[0].strip().lower().replace(" ", "_")
value = row[1].strip()

if key in mapping:
key = mapping[key]

# Convert value to int or float if possible
try:
value = int(value)
except ValueError:
try:
value = float(value)
except ValueError:
pass

# Aggregate values into dictionary
if key in meta_dict:
meta_dict[key] = [meta_dict[key], value]
if not isinstance(meta_dict[key], list):
meta_dict[key] = [meta_dict[key]]
meta_dict[key].append(value)
else:
meta_dict[key] = value

# Add hash to metadata
meta_dict["file_hash"] = {
"file": path.split("/")[-1],
"method": "BLAKE2b",
"hash": file_hash,
}
return meta_dict


Expand Down
Loading

0 comments on commit 7a3e210

Please sign in to comment.