-
Notifications
You must be signed in to change notification settings - Fork 147
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c694c9e
commit b84a598
Showing
16 changed files
with
276 additions
and
113 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Manifest for the `daft-hive` crate: parsing of hive-style partitioned paths.
[dependencies] | ||
arrow2 = {workspace = true, features = ["io_parquet", "io_parquet_compression"]} | ||
common-error = {path = "../common/error", default-features = false} | ||
daft-core = {path = "../daft-core", default-features = false} | ||
daft-decoding = {path = "../daft-decoding", default-features = false} | ||
daft-schema = {path = "../daft-schema", default-features = false} | ||
daft-stats = {path = "../daft-stats", default-features = false} | ||
daft-table = {path = "../daft-table", default-features = false} | ||
|
||
# Lint configuration is inherited from the workspace root.
[lints] | ||
workspace = true | ||
|
||
# Package metadata; edition and version are shared workspace values.
[package] | ||
edition = {workspace = true} | ||
name = "daft-hive" | ||
version = {workspace = true} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
use std::collections::HashMap; | ||
|
||
use common_error::DaftResult; | ||
use daft_core::{datatypes::Utf8Array, series::IntoSeries}; | ||
use daft_decoding::inference::infer; | ||
use daft_schema::{dtype::DaftDataType, field::Field, schema::Schema}; | ||
use daft_table::Table; | ||
|
||
/// Parses hive-style `/key=value/` components from a uri.
///
/// Parsing considers only the portion of the uri before the first `'?'` (start
/// of GET parameters) or `'\n'` (banned for hive partitions, given all the edge
/// cases that can arise). Both `'/'` and `'\\'` act as path separators. The
/// final path component is never treated as a partition, because it must be a
/// file name. A component is a valid partition only when it contains exactly
/// one `'='` with a non-empty key; components with zero or multiple `'='`
/// characters are skipped. (Previously a component such as `=a=b/` was
/// accepted as key `"=a"` despite containing two `'='`; it is now rejected,
/// matching the single-`'='` rule.) Empty values are allowed.
pub fn parse_hive_partitioning(uri: &str) -> HashMap<&str, &str> {
    // Truncate at the first '?' or '\n'; nothing after it can be a partition.
    let end = uri.find(|c| c == '?' || c == '\n').unwrap_or(uri.len());
    let path = &uri[..end];

    let mut partitions = HashMap::new();
    let mut components: Vec<&str> = path.split(|c| c == '/' || c == '\\').collect();
    // Drop the final component: it is either a file name or an unterminated
    // component (no trailing separator), never a partition.
    components.pop();
    for component in components {
        if let Some((key, value)) = component.split_once('=') {
            // Enforce exactly one '=' and a non-empty key.
            if !key.is_empty() && !value.contains('=') {
                partitions.insert(key, value);
            }
        }
    }
    partitions
}
|
||
/// Takes hive partition key-value pairs as `partitions`, and the schema of the containing table as | ||
/// `table_schema`, and returns a 1-dimensional table containing the partition keys as columns, and | ||
/// their partition values as the singular row of values. | ||
pub fn hive_partitions_to_1d_table( | ||
partitions: &HashMap<&str, &str>, | ||
table_schema: &Schema, | ||
) -> DaftResult<Table> { | ||
let uncasted_fields = partitions | ||
.keys() | ||
.map(|&key| Field::new(key, daft_schema::dtype::DataType::Utf8)) | ||
.collect(); | ||
let uncasted_schema = Schema::new(uncasted_fields)?; | ||
let uncasted_series = partitions | ||
.iter() | ||
.map(|(&key, &value)| { | ||
let arrow_array = arrow2::array::Utf8Array::from_iter_values(std::iter::once(&value)); | ||
let daft_array = Utf8Array::from((key, Box::new(arrow_array))); | ||
daft_array.into_series() | ||
}) | ||
.collect::<Vec<_>>(); | ||
let uncast_table = Table::new_unchecked(uncasted_schema, uncasted_series, /*num_rows=*/ 1); | ||
|
||
let partition_fields = table_schema | ||
.fields | ||
.clone() | ||
.into_iter() | ||
.map(|(_, field)| field) | ||
.filter(|field| partitions.contains_key(&field.name.as_str())) | ||
.collect(); | ||
let partition_schema = Schema::new(partition_fields)?; | ||
// TODO(desmond): There's probably a better way to do this. | ||
let casted_table = uncast_table.cast_to_schema(&partition_schema)?; | ||
Ok(casted_table) | ||
} | ||
|
||
/// Turns hive partition key-value pairs into a schema with the partitions' keys as field names, and | ||
/// inferring field types from the partitions' values. We don't do schema type inference here as the | ||
/// user is expected to provide the schema for hive-partitioned fields. | ||
pub fn hive_partitions_to_schema(partitions: &HashMap<&str, &str>) -> DaftResult<Schema> { | ||
let partition_fields: Vec<Field> = partitions | ||
.iter() | ||
.map(|(&key, &value)| Field::new(key, DaftDataType::from(&infer(value.as_bytes())))) | ||
.collect(); | ||
Schema::new(partition_fields) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.