Skip to content

Commit

Permalink
Added support for skip_rows_after_header option in reading csv (#782)
Browse files Browse the repository at this point in the history
  • Loading branch information
JonGretar authored Dec 21, 2023
1 parent f2a6d30 commit 3e04fdb
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 0 deletions.
2 changes: 2 additions & 0 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ defmodule Explorer.Backend.DataFrame do
delimiter :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
skip_rows_after_header :: integer(),
header? :: boolean(),
encoding :: String.t(),
max_rows :: option(integer()),
Expand All @@ -68,6 +69,7 @@ defmodule Explorer.Backend.DataFrame do
delimiter :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
skip_rows_after_header :: integer(),
header? :: boolean(),
encoding :: String.t(),
max_rows :: option(integer()),
Expand Down
7 changes: 7 additions & 0 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,8 @@ defmodule Explorer.DataFrame do
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:skip_rows_after_header` - The number of lines to skip after the header row. (default: `0`)
* `:columns` - A list of column names or indexes to keep.
If present, only these columns are read into the dataframe. (default: `nil`)
Expand Down Expand Up @@ -558,6 +560,7 @@ defmodule Explorer.DataFrame do
max_rows: nil,
nil_values: [],
skip_rows: 0,
skip_rows_after_header: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
parse_dates: false,
Expand All @@ -573,6 +576,7 @@ defmodule Explorer.DataFrame do
opts[:delimiter],
opts[:nil_values],
opts[:skip_rows],
opts[:skip_rows_after_header],
opts[:header],
opts[:encoding],
opts[:max_rows],
Expand Down Expand Up @@ -616,6 +620,7 @@ defmodule Explorer.DataFrame do
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:skip_rows_after_header` - The number of lines to skip after the header row. (default: `0`)
* `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
* `:infer_schema_length` - Maximum number of rows read for schema inference. Setting this to nil will do a full table scan and will be slow (default: `1000`).
* `:parse_dates` - Automatically try to parse dates/ datetimes and time. If parsing fails, columns remain of dtype `string`
Expand All @@ -638,6 +643,7 @@ defmodule Explorer.DataFrame do
max_rows: nil,
nil_values: [],
skip_rows: 0,
skip_rows_after_header: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
parse_dates: false,
Expand All @@ -652,6 +658,7 @@ defmodule Explorer.DataFrame do
opts[:delimiter],
opts[:nil_values],
opts[:skip_rows],
opts[:skip_rows_after_header],
opts[:header],
opts[:encoding],
opts[:max_rows],
Expand Down
6 changes: 6 additions & 0 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
delimiter,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -62,6 +63,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
delimiter,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -83,6 +85,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
<<delimiter::utf8>>,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -105,6 +108,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
header?,
max_rows,
skip_rows,
skip_rows_after_header,
with_projection,
delimiter,
true,
Expand Down Expand Up @@ -174,6 +178,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
<<delimiter::utf8>>,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -196,6 +201,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
header?,
max_rows,
skip_rows,
skip_rows_after_header,
with_projection,
delimiter,
true,
Expand Down
6 changes: 6 additions & 0 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
_,
_,
_,
_,
_
) do
{:error,
Expand All @@ -96,6 +97,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
<<delimiter::utf8>>,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -117,6 +119,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
header?,
max_rows,
skip_rows,
skip_rows_after_header,
delimiter,
true,
Map.to_list(dtypes),
Expand Down Expand Up @@ -145,6 +148,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
_,
_,
_,
_,
_
) do
{:error,
Expand Down Expand Up @@ -232,6 +236,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
delimiter,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand All @@ -247,6 +252,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
delimiter,
nil_values,
skip_rows,
skip_rows_after_header,
header?,
encoding,
max_rows,
Expand Down
3 changes: 3 additions & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ defmodule Explorer.PolarsBackend.Native do
_has_header,
_stop_after_n_rows,
_skip_rows,
_skip_rows_after_header,
_projection,
_sep,
_rechunk,
Expand Down Expand Up @@ -127,6 +128,7 @@ defmodule Explorer.PolarsBackend.Native do
_has_header,
_stop_after_n_rows,
_skip_rows,
_skip_rows_after_header,
_projection,
_sep,
_rechunk,
Expand Down Expand Up @@ -234,6 +236,7 @@ defmodule Explorer.PolarsBackend.Native do
_has_header,
_stop_after_n_rows,
_skip_rows,
_skip_rows_after_header,
_sep,
_rechunk,
_dtypes,
Expand Down
4 changes: 4 additions & 0 deletions native/explorer/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub fn df_from_csv(
has_header: bool,
stop_after_n_rows: Option<usize>,
skip_rows: usize,
skip_rows_after_header: usize,
projection: Option<Vec<usize>>,
delimiter_as_byte: u8,
do_rechunk: bool,
Expand All @@ -53,6 +54,7 @@ pub fn df_from_csv(
.with_n_rows(stop_after_n_rows)
.with_separator(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
Expand Down Expand Up @@ -136,6 +138,7 @@ pub fn df_load_csv(
has_header: bool,
stop_after_n_rows: Option<usize>,
skip_rows: usize,
skip_rows_after_header: usize,
projection: Option<Vec<usize>>,
delimiter_as_byte: u8,
do_rechunk: bool,
Expand All @@ -160,6 +163,7 @@ pub fn df_load_csv(
.with_n_rows(stop_after_n_rows)
.with_separator(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_skip_rows_after_header(skip_rows_after_header)
.with_projection(projection)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
Expand Down
2 changes: 2 additions & 0 deletions native/explorer/src/lazyframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ pub fn lf_from_csv(
has_header: bool,
stop_after_n_rows: Option<usize>,
skip_rows: usize,
skip_rows_after_header: usize,
delimiter_as_byte: u8,
do_rechunk: bool,
dtypes: Vec<(&str, ExSeriesDtype)>,
Expand All @@ -213,6 +214,7 @@ pub fn lf_from_csv(
.with_n_rows(stop_after_n_rows)
.with_separator(delimiter_as_byte)
.with_skip_rows(skip_rows)
.with_skip_rows_after_header(skip_rows_after_header)
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(Some(schema_from_dtypes_pairs(dtypes)?.as_ref()))
Expand Down
1 change: 1 addition & 0 deletions notebooks/exploring_explorer.livemd
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ For CSV, your 'usual suspects' of options are available:
* `max_rows` - Maximum number of lines to read. (default: `nil`)
* `nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `skip_rows_after_header` - The number of lines to skip after the header row. (default: `0`)
* `columns` - A list of column names to keep. If present, only these columns are read into the dataframe. (default: `nil`)

`Explorer` also has multiple example datasets built in, which you can load from the `Explorer.Datasets` module like so:
Expand Down
35 changes: 35 additions & 0 deletions test/explorer/data_frame/csv_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,41 @@ defmodule Explorer.DataFrame.CSVTest do
}
end

@tag :tmp_dir
test "skip_rows_after_header", %{tmp_dir: tmp_dir} do
  # `a,b` is the header row; with one row skipped after the header,
  # `c,d` is dropped and only `e,f` is expected as data.
  path =
    tmp_csv(tmp_dir, """
    a,b
    c,d
    e,f
    """)

  columns =
    path
    |> DF.from_csv!(skip_rows_after_header: 1)
    |> DF.to_columns(atom_keys: true)

  assert columns == %{a: ["e"], b: ["f"]}
end

@tag :tmp_dir
test "skip_rows with skip_rows_after_header", %{tmp_dir: tmp_dir} do
  # Both options combined: the expected result has `c,d` as the header
  # (the leading `a,b` line skipped) and only `g,h` as data (`e,f`,
  # the row right after the header, skipped).
  path =
    tmp_csv(tmp_dir, """
    a,b
    c,d
    e,f
    g,h
    """)

  columns =
    path
    |> DF.from_csv!(skip_rows: 1, skip_rows_after_header: 1)
    |> DF.to_columns(atom_keys: true)

  assert columns == %{c: ["g"], d: ["h"]}
end

@tag :tmp_dir
test "columns - str", config do
csv =
Expand Down

0 comments on commit 3e04fdb

Please sign in to comment.