Merge remote-tracking branch 'upstream/main' into preserve-dtypes
rjzamora committed Jul 26, 2024
2 parents acb1688 + 9a758df commit 91d1135
Showing 1 changed file with 3 additions and 2 deletions.
tests/unit/io/test_io.py (3 additions, 2 deletions)

@@ -579,6 +579,7 @@ def test_hive_partitioned_data(tmpdir, cpu):
     # Make sure the directory structure is hive-like
     df_expect = ddf.compute()
     df_expect = df_expect.sort_values(["id", "x", "y"]).reset_index(drop=True)
+    ts_dtype = df_expect["timestamp"].dtype
     timestamp_check = df_expect["timestamp"].iloc[0]
     name_check = df_expect["name"].iloc[0]
     result_paths = glob.glob(
@@ -596,7 +597,7 @@ def test_hive_partitioned_data(tmpdir, cpu):
     # Read back with dask.dataframe and check the data
     df_check = dd.read_parquet(path, engine="pyarrow").compute()
     df_check["name"] = df_check["name"].astype("object")
-    df_check["timestamp"] = df_check["timestamp"].astype("int64")
+    df_check["timestamp"] = df_check["timestamp"].astype(ts_dtype)
     df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
     for col in df_expect:
         # Order of columns can change after round-trip partitioning
@@ -605,7 +606,7 @@ def test_hive_partitioned_data(tmpdir, cpu):
     # Read back with NVT and check the data
     df_check = merlin.io.Dataset(path, engine="parquet").to_ddf().compute()
     df_check["name"] = df_check["name"].astype("object")
-    df_check["timestamp"] = df_check["timestamp"].astype("int64")
+    df_check["timestamp"] = df_check["timestamp"].astype(ts_dtype)
     df_check = df_check.sort_values(["id", "x", "y"]).reset_index(drop=True)
     for col in df_expect:
         # Order of columns can change after round-trip partitioning
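The change is a small dtype-preservation fix: the test previously hard-coded `astype("int64")` when normalizing the `timestamp` column after each read-back, which breaks whenever the expected frame carries a different timestamp dtype. Capturing `ts_dtype` from the expected frame up front and casting back to it keeps the comparison dtype-agnostic. A minimal standalone sketch of the same pattern, using plain pandas rather than the dask/merlin harness in the actual test (the toy data and file name are illustrative only; `to_parquet` assumes pyarrow or fastparquet is installed):

```python
import pandas as pd

# Illustrative data only; the real test round-trips a hive-partitioned
# dask/merlin dataset, not this toy frame.
df_expect = pd.DataFrame(
    {
        "id": [0, 1],
        "timestamp": [1704067200, 1704153600],  # inferred as int64 here
    }
)

# Capture the column's dtype up front instead of assuming int64.
ts_dtype = df_expect["timestamp"].dtype

df_expect.to_parquet("example.parquet")  # hypothetical path, for illustration
df_check = pd.read_parquet("example.parquet")

# Cast back to whatever dtype the expected frame actually had, so the
# comparison still holds if the source dtype ever changes.
df_check["timestamp"] = df_check["timestamp"].astype(ts_dtype)

pd.testing.assert_series_equal(df_expect["timestamp"], df_check["timestamp"])
```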
