Skip to content

Commit

Permalink
[BUG] Fix time type inference (#2441)
Browse files Browse the repository at this point in the history
Fixes JSON time type inference to use Time64 instead of Time32.

---------

Co-authored-by: Jay Chia <[email protected]>
  • Loading branch information
colin-ho and jaychia authored Jun 27, 2024
1 parent 3af6069 commit 7422f2f
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 13 deletions.
8 changes: 7 additions & 1 deletion src/daft-decoding/src/inference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ pub fn infer_string(string: &str) -> DataType {
if is_date(string) {
DataType::Date32
} else if let Some(time_unit) = is_time(string) {
DataType::Time32(time_unit)
DataType::Time64(time_unit)
} else if let Some((time_unit, offset)) = is_datetime(string) {
// NOTE: We try to parse as a non-naive datatime (with timezone information) first,
// since is_datetime() will return false if timezone information is not present in the string,
Expand Down Expand Up @@ -77,6 +77,12 @@ fn is_date(string: &str) -> bool {
fn is_time(string: &str) -> Option<TimeUnit> {
if let Ok(t) = string.parse::<chrono::NaiveTime>() {
let time_unit = nanoseconds_to_time_unit(t.nanosecond());
// NOTE: We only support Time64 with nanosecond or microsecond granularity,
// so if the parsed timeunit is millisecond or second,
// map it to Time64 with microsecond granularity.
if time_unit != TimeUnit::Nanosecond {
return Some(TimeUnit::Microsecond);
}
return Some(time_unit);
}
None
Expand Down
4 changes: 2 additions & 2 deletions src/daft-json/src/inference.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ pub(crate) fn coerce_data_type(mut datatypes: HashSet<DataType>) -> DataType {
(DataType::Int64, DataType::Boolean) | (DataType::Boolean, DataType::Int64) => {
DataType::Int64
}
(DataType::Time32(left_tu), DataType::Time32(right_tu)) => {
(DataType::Time64(left_tu), DataType::Time64(right_tu)) => {
// Set unified time unit to the lowest granularity time unit.
let unified_tu = if left_tu == right_tu
|| time_unit_to_ordinal(&left_tu) < time_unit_to_ordinal(&right_tu)
Expand All @@ -194,7 +194,7 @@ pub(crate) fn coerce_data_type(mut datatypes: HashSet<DataType>) -> DataType {
} else {
right_tu
};
DataType::Time32(unified_tu)
DataType::Time64(unified_tu)
}
(
DataType::Timestamp(left_tu, left_tz),
Expand Down
5 changes: 2 additions & 3 deletions src/daft-json/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -627,9 +627,8 @@ mod tests {
Field::new("str", DataType::Utf8),
Field::new("null", DataType::Null),
Field::new("date", DataType::Date),
// TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft.
// // Time unit should be coarest granularity found in file, i.e. seconds.
// Field::new("time", DataType::Time(TimeUnit::Nanoseconds)),
// Time unit should be coarest granularity found in file, i.e. microseconds.
Field::new("time", DataType::Time(TimeUnit::Microseconds)),
// Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion.
Field::new(
"naive_timestamp",
Expand Down
5 changes: 2 additions & 3 deletions src/daft-json/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,9 +278,8 @@ mod tests {
Field::new("str", DataType::Utf8),
Field::new("null", DataType::Null),
Field::new("date", DataType::Date),
// TODO(Clark): Add coverage for time parsing once we add support for representing time series in Daft.
// // Time unit should be coarest granularity found in file, i.e. seconds.
// Field::new("time", DataType::Time(TimeUnit::Nanoseconds)),
// // Time unit should be coarsest granularity found in file, i.e. microseconds.
Field::new("time", DataType::Time(TimeUnit::Microseconds)),
// Time unit should be coarsest granularity found in file, i.e. seconds due to naive date inclusion.
Field::new(
"naive_timestamp",
Expand Down
8 changes: 4 additions & 4 deletions src/daft-json/test/dtypes.jsonl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{"int": 1, "float": 2.3, "bool": false, "str": "foo", "null": null, "date": "2023-11-29", "naive_timestamp": "2023-11-29T06:31:52.342", "timestamp": "2023-11-29T06:31:52.342567Z", "list": [1, 2, 3], "obj": {"a": 1, "b": false}, "nested_list": [[{"a": "foo"}]], "nested_obj": {"obj": {"a": 4}, "list": [1, 2]}}
{"int": 2, "float": 3.3, "bool": true, "str": "bar", "null": null, "date": "2023/11/28", "naive_timestamp": "2023-11-29", "timestamp": "2023-11-29T06:31:52.342+07:00", "list": [4, 5], "obj": {"a": 2, "b": false}, "nested_list": [[{"a": "bar"}]], "nested_obj": {"obj": {"a": 6}, "list": [3, 4]}}
{"int": null, "float": null, "bool": null, "str": null, "null": null, "date": null, "naive_timestamp": null, "timestamp": null, "list": null, "obj": null, "nested_list": null, "nested_obj": null}
{"int": 3, "float": 4.3, "bool": false, "str": "baz", "null": null, "date": "2023-11-27", "naive_timestamp": "2023-11-29 06:31:52.342567", "timestamp": "2023-11-29 06:31:52.342-07:00", "list": [6, 7, null, 9], "obj": {"a": null, "b": false}, "nested_list": [[{"a": null}]], "nested_obj": {"obj": {"a": null}, "list": [5, null]}}
{"int": 1, "float": 2.3, "bool": false, "str": "foo", "null": null, "date": "2023-11-29", "time": "12:01:23.342", "naive_timestamp": "2023-11-29T06:31:52.342", "timestamp": "2023-11-29T06:31:52.342567Z", "list": [1, 2, 3], "obj": {"a": 1, "b": false}, "nested_list": [[{"a": "foo"}]], "nested_obj": {"obj": {"a": 4}, "list": [1, 2]}}
{"int": 2, "float": 3.3, "bool": true, "str": "bar", "null": null, "date": "2023/11/28", "time": "12:01:23", "naive_timestamp": "2023-11-29", "timestamp": "2023-11-29T06:31:52.342+07:00", "list": [4, 5], "obj": {"a": 2, "b": false}, "nested_list": [[{"a": "bar"}]], "nested_obj": {"obj": {"a": 6}, "list": [3, 4]}}
{"int": null, "float": null, "bool": null, "str": null, "null": null, "date": null, "time": null, "naive_timestamp": null, "timestamp": null, "list": null, "obj": null, "nested_list": null, "nested_obj": null}
{"int": 3, "float": 4.3, "bool": false, "str": "baz", "null": null, "date": "2023-11-27", "time": "12:01:23.342129312", "naive_timestamp": "2023-11-29 06:31:52.342567", "timestamp": "2023-11-29 06:31:52.342-07:00", "list": [6, 7, null, 9], "obj": {"a": null, "b": false}, "nested_list": [[{"a": null}]], "nested_obj": {"obj": {"a": null}, "list": [5, null]}}

0 comments on commit 7422f2f

Please sign in to comment.