From c510b0180e722cac9dacb0e59f470b3861a3b5a1 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Thu, 26 Sep 2024 09:15:21 -0400 Subject: [PATCH 1/4] enhance window frame --- src/joins_sq.jl | 230 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 201 insertions(+), 29 deletions(-) diff --git a/src/joins_sq.jl b/src/joins_sq.jl index ea804db..163b8ab 100644 --- a/src/joins_sq.jl +++ b/src/joins_sq.jl @@ -25,41 +25,95 @@ end $docstring_left_join """ macro left_join(sqlquery, join_table, lhs_column, rhs_column) - # Convert column references to string + # Convert column references to strings lhs_col_str = string(lhs_column) rhs_col_str = string(rhs_column) - join_table = QuoteNode(join_table) + # Removed the QuoteNode wrapping to allow evaluation of join_table + # join_table = QuoteNode(join_table) return quote sq = $(esc(sqlquery)) + jq = $(esc(join_table)) # Evaluate join_table + if isa(sq, SQLQuery) needs_new_cte = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) if needs_new_cte sq.cte_count += 1 cte_name = "cte_" * string(sq.cte_count) - most_recent_source = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from - join_sql = " " * most_recent_source * ".*, " * get_join_columns(sq.db, string($(esc(join_table))), $lhs_col_str) * gbq_join_parse(most_recent_source) * - " LEFT JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(most_recent_source), ".", $rhs_col_str) + if isa(jq, SQLQuery) + # Handle when join_table is an SQLQuery + needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) + if needs_new_cte_jq + jq.cte_count += 1 + cte_name_jq = "jcte_" * string(jq.cte_count) + most_recent_source_jq = !isempty(jq.ctes) ? "jcte_" * string(jq.cte_count - 1) : jq.from + select_sql_jq = "SELECT * FROM " * most_recent_source_jq + new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) + push!(jq.ctes, new_cte_jq) + jq.from = cte_name_jq + end + # Combine CTEs and metadata + sq.ctes = vcat(sq.ctes, jq.ctes) + sq.metadata = vcat(sq.metadata, jq.metadata) + join_table_name = jq.from + else + # When join_table is a table name + join_table_name = string(jq) + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, join_table_name) + else + new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) + end + + join_sql = " " * most_recent_source * ".*, " * + get_join_columns(sq.db, join_table_name, $lhs_col_str) * gbq_join_parse(most_recent_source) * + " LEFT JOIN " * join_table_name * " ON " * + gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * + gbq_join_parse(most_recent_source) * "." * $rhs_col_str # Create and add the new CTE new_cte = CTE(name=cte_name, select=join_sql) push!(sq.ctes, new_cte) - # Update the FROM clause sq.from = cte_name else - join_clause = " LEFT JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(sq.from), ".", $rhs_col_str) + if isa(jq, SQLQuery) + # Handle when join_table is an SQLQuery + needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) + + if needs_new_cte_jq + jq.cte_count += 1 + cte_name_jq = "ncte_" * string(jq.cte_count) + most_recent_source_jq = !isempty(jq.ctes) ? "cte_" * string(jq.cte_count - 1) : jq.from + select_sql_jq = "SELECT * FROM " * most_recent_source_jq + new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) + push!(jq.ctes, new_cte_jq) + jq.from = cte_name_jq + end + # Combine CTEs and metadata + sq.ctes = vcat(sq.ctes, jq.ctes) + sq.metadata = vcat(sq.metadata, jq.metadata) + join_table_name = jq.from + else + # When join_table is a table name + join_table_name = string(jq) + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, join_table_name) + else + new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) + end + + join_clause = " LEFT JOIN " * join_table_name * " ON " * + gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * + gbq_join_parse(sq.from) * "." * $rhs_col_str sq.from *= join_clause end - - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, string($(esc(join_table)))) - else - new_metadata = get_table_metadata_athena(sq.db, string($(esc(join_table))), sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) else error("Expected sqlquery to be an instance of SQLQuery") end @@ -67,6 +121,7 @@ macro left_join(sqlquery, join_table, lhs_column, rhs_column) end end + """ $docstring_right_join """ @@ -167,42 +222,92 @@ end $docstring_full_join """ macro full_join(sqlquery, join_table, lhs_column, rhs_column) - # Convert column references to string + # Convert column references to strings lhs_col_str = string(lhs_column) rhs_col_str = string(rhs_column) - join_table = QuoteNode(join_table) return quote sq = $(esc(sqlquery)) + jq = $(esc(join_table)) # Evaluate join_table + if isa(sq, SQLQuery) needs_new_cte = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) - if needs_new_cte sq.cte_count += 1 cte_name = "cte_" * string(sq.cte_count) - most_recent_source = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from - join_sql = " " * most_recent_source * ".*, " * get_join_columns(sq.db, string($(esc(join_table))), $lhs_col_str) * gbq_join_parse(most_recent_source) * - " FULL JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(most_recent_source), ".", $rhs_col_str) + if isa(jq, SQLQuery) + # Handle when join_table is an SQLQuery + needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) + if needs_new_cte_jq + jq.cte_count += 1 + cte_name_jq = "jcte_" * string(jq.cte_count) + most_recent_source_jq = !isempty(jq.ctes) ? "jcte_" * string(jq.cte_count - 1) : jq.from + select_sql_jq = "SELECT * FROM " * most_recent_source_jq + new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) + push!(jq.ctes, new_cte_jq) + jq.from = cte_name_jq + end + # Combine CTEs and metadata + sq.ctes = vcat(sq.ctes, jq.ctes) + sq.metadata = vcat(sq.metadata, jq.metadata) + join_table_name = jq.from + else + # When join_table is a table name + join_table_name = string(jq) + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, join_table_name) + else + new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) + end + + join_sql = " " * most_recent_source * ".*, " * + get_join_columns(sq.db, join_table_name, $lhs_col_str) * gbq_join_parse(most_recent_source) * + " FULL JOIN " * join_table_name * " ON " * + gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * + gbq_join_parse(most_recent_source) * "." * $rhs_col_str # Create and add the new CTE new_cte = CTE(name=cte_name, select=join_sql) push!(sq.ctes, new_cte) - # Update the FROM clause sq.from = cte_name else - join_clause = " FULL JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(sq.from), ".", $rhs_col_str) + if isa(jq, SQLQuery) + # Handle when join_table is an SQLQuery + needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) + if needs_new_cte_jq + jq.cte_count += 1 + cte_name_jq = "ncte_" * string(jq.cte_count) + most_recent_source_jq = !isempty(jq.ctes) ? "cte_" * string(jq.cte_count - 1) : jq.from + select_sql_jq = "SELECT * FROM " * most_recent_source_jq + new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) + push!(jq.ctes, new_cte_jq) + jq.from = cte_name_jq + end + # Combine CTEs and metadata + sq.ctes = vcat(sq.ctes, jq.ctes) + sq.metadata = vcat(sq.metadata, jq.metadata) + join_table_name = jq.from + else + # When join_table is a table name + join_table_name = string(jq) + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, join_table_name) + else + new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) + end + + join_clause = " FULL JOIN " * join_table_name * " ON " * + gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * + gbq_join_parse(sq.from) * "." * $rhs_col_str sq.from *= join_clause end - - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, string($(esc(join_table)))) - else - new_metadata = get_table_metadata_athena(sq.db, string($(esc(join_table))), sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) else error("Expected sqlquery to be an instance of SQLQuery") end @@ -211,6 +316,7 @@ macro full_join(sqlquery, join_table, lhs_column, rhs_column) end + """ $docstring_semi_join """ @@ -305,3 +411,69 @@ macro anti_join(sqlquery, join_table, lhs_column, rhs_column) sq end end + +""" +$docstring_union +""" +macro union(sqlquery, union_query) + return quote + sq = $(esc(sqlquery)) + uq = $(esc(union_query)) + + if isa(sq, SQLQuery) + # Determine if sq needs a new CTE + needs_new_cte_sq = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) + if needs_new_cte_sq + sq.cte_count += 1 + cte_name_sq = "cte_" * string(sq.cte_count) + most_recent_source_sq = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from + select_sql_sq = "SELECT * FROM " * most_recent_source_sq + new_cte_sq = CTE(name=cte_name_sq, select=select_sql_sq) + push!(sq.ctes, new_cte_sq) + sq.from = cte_name_sq + end + + # Prepare the union query + if isa(uq, SQLQuery) + # Determine if uq needs a new CTE + needs_new_cte_uq = !isempty(uq.select) || !isempty(uq.where) || uq.is_aggregated || !isempty(uq.ctes) + if needs_new_cte_uq + uq.cte_count += 1 + cte_name_uq = "cte_" * string(uq.cte_count) + most_recent_source_uq = !isempty(uq.ctes) ? "cte_" * string(uq.cte_count - 1) : uq.from + select_sql_uq = "SELECT * FROM " * most_recent_source_uq + new_cte_uq = CTE(name=cte_name_uq, select=select_sql_uq) + push!(uq.ctes, new_cte_uq) + uq.from = cte_name_uq + end + + # Combine the queries using UNION + union_sql = "SELECT * FROM " * sq.from * " UNION SELECT * FROM " * uq.from + + # Merge CTEs and metadata + sq.ctes = vcat(sq.ctes, uq.ctes) + sq.metadata = vcat(sq.metadata, uq.metadata) + else + # Treat uq as a table name + union_sql = "SELECT * FROM " * sq.from * " UNION SELECT * FROM " * string(uq) + # Update metadata + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, string(uq)) + else + new_metadata = get_table_metadata_athena(sq.db, string(uq), sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) + end + + # Create a new CTE for the union + sq.cte_count += 1 + union_cte_name = "cte_" * string(sq.cte_count) + union_cte = CTE(name=union_cte_name, select=union_sql) + push!(sq.ctes, union_cte) + sq.from = union_cte_name + else + error("Expected sqlquery to be an instance of SQLQuery") + end + sq + end +end \ No newline at end of file From faf07da160256a08f08ca9a20284870b92e99ff5 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Thu, 26 Sep 2024 09:18:34 -0400 Subject: [PATCH 2/4] fix connect for duckdb file --- README.md | 15 +- docs/examples/UserGuide/from_queryex.jl | 2 +- docs/examples/UserGuide/getting_started.jl | 4 +- docs/examples/UserGuide/ibis_comp.jl | 2 - docs/src/index.md | 5 +- src/TBD_macros.jl | 86 ----------- src/TidierDB.jl | 7 +- src/docstrings.jl | 56 ++++++- src/windows.jl | 169 +++++++++++++++++++++ 9 files changed, 237 insertions(+), 109 deletions(-) create mode 100644 src/windows.jl diff --git a/README.md b/README.md index 14f8c6e..74984b3 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,10 @@ TidierDB.jl currently supports the following top-level macros: | **Helper Functions** | `across`, `desc`, `if_else`, `case_when`, `n`, `starts_with`, `ends_with`, `contains`, `as_float`, `as_integer`, `as_string`, `is_missing`, `missing_if`, `replace_missing` | | **TidierStrings.jl Functions** | `str_detect`, `str_replace`, `str_replace_all`, `str_remove_all`, `str_remove` | | **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime` | -| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, +| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, all SQL aggregate -`@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. | +`@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. +`@mutate` supports nearly all functions built into SQL as well. When using the DuckDB backend, if `db_table` recieves a file path (`.parquet`, `.json`, `.csv`, `iceberg` or `delta`), it does not copy it into memory. This allows for queries on files too big for memory. `db_table` also supports S3 bucket locations via DuckDB. @@ -67,7 +68,9 @@ import TidierDB as DB db = DB.connect(DB.duckdb()); path_or_name = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" -@chain DB.db_table(db, path_or_name) begin +mtcars = DB.db_table(db, path_or_name); + +@chain DB.t(mtcars) begin DB.@filter(!starts_with(model, "M")) DB.@group_by(cyl) DB.@summarize(mpg = mean(mpg)) @@ -97,7 +100,7 @@ end We cannot do this using TidierDB. However, we can call `@pivot_longer()` from TidierData *after* the result of the query has been instantiated as a DataFrame, like this: ```julia -@chain DB.db_table(db, path_or_name) begin +@chain DB.t(mtcars) begin DB.@filter(!starts_with(model, "M")) DB.@group_by(cyl) DB.@summarize(mpg = mean(mpg)) @@ -136,7 +139,7 @@ end We can replace `DB.collect()` with `DB.@show_query` to reveal the underlying SQL query being generated by TidierDB. To handle complex queries, TidierDB makes heavy use of Common Table Expressions (CTE), which are a useful tool to organize long queries. ```julia -@chain DB.db_table(db, path_or_name) begin +@chain DB.t(mtcars) begin DB.@filter(!starts_with(model, "M")) DB.@group_by(cyl) DB.@summarize(mpg = mean(mpg)) @@ -176,7 +179,7 @@ SELECT * ## TidierDB is already quite fully-featured, supporting advanced TidierData functions like `across()` for multi-column selection. ```julia -@chain DB.db_table(db, path_or_name) begin +@chain DB.t(mtcars) begin DB.@group_by(cyl) DB.@summarize(across((starts_with("a"), ends_with("s")), (mean, sum))) DB.@collect diff --git a/docs/examples/UserGuide/from_queryex.jl b/docs/examples/UserGuide/from_queryex.jl index 8a39597..f92c451 100644 --- a/docs/examples/UserGuide/from_queryex.jl +++ b/docs/examples/UserGuide/from_queryex.jl @@ -1,4 +1,4 @@ -# While using TidierDB, you may need to generate part of a query and reuse it multiple times. `from_query()` enables a query portion to be reused multiple times as shown below. +# While using TidierDB, you may need to generate part of a query and reuse it multiple times. `from_query()` or `t()` enable a query portion to be reused multiple times as shown below. # ```julia # import TidierDB as DB diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index fef2524..e3e6774 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -53,10 +53,10 @@ # If you are working with a backend where compute cost is important, it will be important to minimize using `db_table` as this will requery for metadata each time. # Compute costs are relevant to backends such as AWS, databricks and Snowflake. -# To do this, save the results of `db_table` and use them with `from_query`. Using `from_query` pulls the relevant information (metadata, con, etc) from the mutable SQLquery struct, allowing you to repeatedly query and collect the table without requerying for the metadata each time +# To do this, save the results of `db_table` and use them with `t`. Using `t` pulls the relevant information (metadata, con, etc) from the mutable SQLquery struct, allowing you to repeatedly query and collect the table without requerying for the metadata each time # ```julia # table = DB.db_table(con, "path") -# @chain DB.from_query(table) begin +# @chain DB.t(table) begin # ## data wrangling here # end # ``` diff --git a/docs/examples/UserGuide/ibis_comp.jl b/docs/examples/UserGuide/ibis_comp.jl index 5e122c4..be99876 100644 --- a/docs/examples/UserGuide/ibis_comp.jl +++ b/docs/examples/UserGuide/ibis_comp.jl @@ -17,8 +17,6 @@ # ```julia # using TidierDB # db = connect(duckdb()) -# # This next line is optional, but it will let us avoid writing `db_table` or `from_query` for each query -# t(table) = from_query(table) # ``` # Of note, TidierDB does not yet have an "interactive mode" so each example result will be collected. diff --git a/docs/src/index.md b/docs/src/index.md index ad9fc8c..b3a4fc9 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -39,9 +39,10 @@ TidierDB.jl currently supports: | **Helper Functions** | `across`, `desc`, `if_else`, `case_when`, `n`, `starts_with`, `ends_with`, `contains`, `as_float`, `as_integer`, `as_string`, `is_missing`, `missing_if`, `replace_missing` | | **TidierStrings.jl Functions** | `str_detect`, `str_replace`, `str_replace_all`, `str_remove_all`, `str_remove` | | **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime` | -| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, +| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, all SQL aggregate -`@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. | +`@summarize` supports any SQL aggregate function in addition to the list above. Simply write the function as written in SQL syntax and it will work. +`@mutate` supports nearly all functions built into SQL as well. When using the DuckDB backend, if `db_table` recieves a file path ( `.parquet`, `.json`, `.csv`, `iceberg` or `delta`), it does not copy it into memory. This allows for queries on files too big for memory. `db_table` also supports S3 bucket locations via DuckDB. diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 8609f31..70630dc 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -552,92 +552,6 @@ macro rename(sqlquery, renamings...) end end -""" -$docstring_window_order -""" -macro window_order(sqlquery, order_by_expr...) - order_by_expr = parse_interpolation2.(order_by_expr) - - return quote - sq = $(esc(sqlquery)) - if isa(sq, SQLQuery) - # Convert order_by_expr to SQL order by string - order_by_sql = join([expr_to_sql(expr, sq) for expr in $(esc(order_by_expr))], ", ") - - # Update the orderBy field of the SQLQuery instance - sq.window_order = order_by_sql - - # If this is the first operation after an aggregation, wrap current state in a CTE - if sq.post_aggregation - sq.post_aggregation = false - cte_name = "cte_" * string(sq.cte_count + 1) - cte_sql = "SELECT * FROM " * sq.from - - if !isempty(sq.where) - cte_sql *= " WHERE " * sq.where - end - - new_cte = CTE(name=cte_name, select=cte_sql, from=sq.from) - push!(sq.ctes, new_cte) - sq.cte_count += 1 - - # Reset the from to reference the new CTE - sq.from = cte_name - end - - # Note: Actual window functions would be applied in subsequent @mutate calls or similar, - # potentially using the orderBy set here for their OVER() clauses. - else - error("Expected sqlquery to be an instance of SQLQuery") - end - sq - end -end - -""" -$docstring_window_frame -""" -macro window_frame(sqlquery, frame_start::Int, frame_end::Int) - return quote - sq = $(esc(sqlquery)) - if isa(sq, SQLQuery) - # Validate frame_start and frame_end - if $frame_end < $frame_start - error("frame_end must be greater than or equal to frame_start") - end - - # Calculate absolute values for frame_start and frame_end - abs_frame_start = abs($frame_start) - abs_frame_end = abs($frame_end) - - # Determine the direction and clause for frame_start - frame_start_clause = if $frame_start < 0 - string(abs_frame_start, " PRECEDING") - else - string(abs_frame_start, " FOLLOWING") - end - - # Determine the direction and clause for frame_end - frame_end_clause = if $frame_end < 0 - string(abs_frame_end, " PRECEDING") - else - string(abs_frame_end, " FOLLOWING") - end - - # Construct the window frame clause - frame_clause = string("ROWS BETWEEN ", frame_start_clause, " AND ", frame_end_clause) - - # Update the windowFrame field of the SQLQuery instance - sq.windowFrame = frame_clause - else - error("Expected sqlquery to be an instance of SQLQuery") - end - sq - end -end - - - macro show_query(sqlquery) return quote diff --git a/src/TidierDB.jl b/src/TidierDB.jl index d41af89..e0181b3 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -18,7 +18,8 @@ using GZip @distinct, @left_join, @right_join, @inner_join, @count, @window_order, @window_frame, @show_query, @collect, @slice_max, @slice_min, @slice_sample, @rename, copy_to, duckdb_open, duckdb_connect, @semi_join, @full_join, @anti_join, connect, from_query, @interpolate, add_interp_parameter!, update_con, @head, - clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery, show_tables, t + clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery, show_tables, + t, @union abstract type SQLBackend end @@ -58,7 +59,7 @@ include("parsing_oracle.jl") include("parsing_databricks.jl") include("joins_sq.jl") include("slices_sq.jl") - +include("windows.jl") @@ -413,7 +414,7 @@ end function connect(::duckdb, token::String) if token == "md:" return DBInterface.connect(DuckDB.DB, "md:") - elseif endswith(token, ".duckdb") + elseif endswith(token, ".duckdb") || endswith(token, ".duck.db") return DuckDB.DB(token) else return DBInterface.connect(DuckDB.DB, "md:$token") diff --git a/src/docstrings.jl b/src/docstrings.jl index 557c31d..37e2273 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -20,7 +20,9 @@ julia> db = connect(duckdb()); julia> copy_to(db, df, "df_mem"); -julia> @chain db_table(db, :df_mem) begin +julia> df_mem = db_table(db, :df_mem); + +julia> @chain t(df_mem) begin @select(groups:percent) @collect end @@ -39,7 +41,7 @@ julia> @chain db_table(db, :df_mem) begin 9 │ bb 4 0.9 10 │ aa 5 1.0 -julia> @chain db_table(db, :df_mem) begin +julia> @chain t(df_mem) begin @select(contains("e")) @collect end @@ -925,20 +927,30 @@ julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], julia> db = connect(duckdb()); julia> copy_to(db, df, "df_mem"); + +julia> @chain db_table(db, :df_mem) begin + @group_by groups + @window_frame(3) + @window_order(desc(percent)) + @mutate(avg = mean(value)) + #@show_query + end; ``` """ const docstring_window_frame = """ - @window_frame(sql_query, frame_start::Int, frame_end::Int) + @window_frame(sql_query, args...) Define the window frame for window functions in a SQL query, specifying the range of rows to include in the calculation relative to the current row. # Arguments -sql_query: The SQL query to operate on, expected to be an instance of SQLQuery. -- `frame_start`: The starting point of the window frame. A positive value indicates the start after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row. -- `frame_end`: The ending point of the window frame. A positive value indicates the end after the current row (FOLLOWING), a negative value indicates before the current row (PRECEDING), and 0 indicates the current row. - +- `sqlquery::SQLQuery`: The SQLQuery instance to which the window frame will be applied. +- `args...`: A variable number of arguments specifying the frame boundaries. These can be: + - `from`: The starting point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDED + - `to`: The ending point of the frame. Can be a positive or negative integer, 0 or empty. When empty, it will use UNBOUNDED + - if only one integer is provided without specifying `to` or `from` it will default to from, and to will be UNBOUNDED. + - if no arguments are given, both will be UNBOUNDED # Examples ```jldoctest julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], @@ -949,6 +961,36 @@ julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], julia> db = connect(duckdb()); julia> copy_to(db, df, "df_mem"); + +julia> df_mem = db_table(db, :df_mem); + +julia> @chain t(df_mem) begin + @group_by groups + @window_frame(3) + @mutate(avg = mean(percent)) + #@show_query + end; + +julia> @chain t(df_mem) begin + @group_by groups + @window_frame(-3, 3) + @mutate(avg = mean(percent)) + #@show_query + end; + +julia> @chain t(df_mem) begin + @group_by groups + @window_frame(to = -3) + @mutate(avg = mean(percent)) + #@show_query + end; + +julia> @chain t(df_mem) begin + @group_by groups + @window_frame() + @mutate(avg = mean(percent)) + #@show_query + end; ``` """ diff --git a/src/windows.jl b/src/windows.jl new file mode 100644 index 0000000..bae4f3a --- /dev/null +++ b/src/windows.jl @@ -0,0 +1,169 @@ +""" +$docstring_window_order +""" +macro window_order(sqlquery, order_by_expr...) + order_by_expr = parse_interpolation2.(order_by_expr) + + return quote + sq = $(esc(sqlquery)) + if isa(sq, SQLQuery) + # Convert order_by_expr to SQL order by string + order_specs = String[] + for expr in $(esc(order_by_expr)) + if isa(expr, Expr) && expr.head == :call && expr.args[1] == :desc + # Column specified with `desc()`, indicating descending order + push!(order_specs, string(expr.args[2]) * " DESC") + elseif isa(expr, Symbol) + # Plain column symbol, indicating ascending order + push!(order_specs, string(expr) * " ASC") + else + throw("Unsupported column specification in @window_order: $expr") + end + end + order_by_sql = join(order_specs, ", ") + + # Update the window_order field of the SQLQuery instance + sq.window_order = order_by_sql + + # If this is the first operation after an aggregation, wrap current state in a CTE + if sq.post_aggregation + sq.post_aggregation = false + cte_name = "cte_" * string(sq.cte_count + 1) + cte_sql = "SELECT * FROM " * sq.from + + if !isempty(sq.where) + cte_sql *= " WHERE " * sq.where + end + + new_cte = CTE(name=cte_name, select=cte_sql, from=sq.from) + push!(sq.ctes, new_cte) + sq.cte_count += 1 + + # Reset the from to reference the new CTE + sq.from = cte_name + end + + # Note: Actual window functions would be applied in subsequent @mutate calls or similar, + # potentially using the orderBy set here for their OVER() clauses. + else + error("Expected sqlquery to be an instance of SQLQuery") + end + sq + end +end + +""" +$docstring_window_frame +""" +macro window_frame(sqlquery, args...) + sqlquery_expr = esc(sqlquery) + # Initialize expressions for from and to values + frame_from_expr = nothing + frame_to_expr = nothing + + # Process the arguments at macro expansion time + for arg in args + if isa(arg, Expr) && arg.head == :(=) + # Named argument + arg_name = arg.args[1] + arg_value = arg.args[2] + if arg_name == :from + frame_from_expr = arg_value + elseif arg_name == :to + frame_to_expr = arg_value + else + error("Unknown keyword argument: $(arg_name)") + end + else + # Positional argument + if frame_from_expr === nothing + frame_from_expr = arg + elseif frame_to_expr === nothing + frame_to_expr = arg + else + error("Too many positional arguments") + end + end + end + + # Now generate the code that computes the frame clauses at runtime + return quote + sq = $sqlquery_expr + if isa(sq, SQLQuery) + # Evaluate frame_from_value and frame_to_value + frame_from_value = $(frame_from_expr !== nothing ? esc(frame_from_expr) : :(nothing)) + frame_to_value = $(frame_to_expr !== nothing ? esc(frame_to_expr) : :(nothing)) + + # Initialize frame_start_clause and frame_end_clause + frame_start_clause = "" + frame_end_clause = "" + + if frame_from_value !== nothing && frame_to_value === nothing + # Only from is specified + frame_start_value = frame_from_value + frame_start_clause = if frame_start_value == 0 + "CURRENT ROW" + elseif frame_start_value < 0 + string(abs(frame_start_value), " PRECEDING") + elseif frame_start_value > 0 + string(abs(frame_start_value), " FOLLOWING") + else + error("Invalid frame_from_value") + end + # Set frame_end_clause to "UNBOUNDED FOLLOWING" + frame_end_clause = "UNBOUNDED FOLLOWING" + elseif frame_from_value === nothing && frame_to_value !== nothing + # Only to is specified + frame_end_value = frame_to_value + frame_end_clause = if frame_end_value == 0 + "CURRENT ROW" + elseif frame_end_value < 0 + string(abs(frame_end_value), " PRECEDING") + elseif frame_end_value > 0 + string(abs(frame_end_value), " FOLLOWING") + else + error("Invalid frame_to_value") + end + # Set frame_start_clause to "UNBOUNDED PRECEDING" + frame_start_clause = "UNBOUNDED PRECEDING" + elseif frame_from_value !== nothing && frame_to_value !== nothing + # Both from and to are specified + frame_start_value = frame_from_value + frame_start_clause = if frame_start_value == 0 + "CURRENT ROW" + elseif frame_start_value < 0 + string(abs(frame_start_value), " PRECEDING") + elseif frame_start_value > 0 + string(abs(frame_start_value), " FOLLOWING") + else + error("Invalid frame_from_value") + end + + frame_end_value = frame_to_value + frame_end_clause = if frame_end_value == 0 + "CURRENT ROW" + elseif frame_end_value < 0 + string(abs(frame_end_value), " PRECEDING") + elseif frame_end_value > 0 + string(abs(frame_end_value), " FOLLOWING") + else + error("Invalid frame_to_value") + end + else + # Neither from nor to is specified + frame_start_clause = "UNBOUNDED PRECEDING" + frame_end_clause = "UNBOUNDED FOLLOWING" + + end + + # Construct the window frame clause + frame_clause = "ROWS BETWEEN $(frame_start_clause) AND $(frame_end_clause)" + + # Update the windowFrame field of the SQLQuery instance + sq.windowFrame = frame_clause + else + error("Expected sqlquery to be an instance of SQLQuery") + end + sq + end +end From 52754b42fd6914056aba05cc70d1cfa81c24a6d1 Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Thu, 26 Sep 2024 09:31:21 -0400 Subject: [PATCH 3/4] revert join changes --- src/TidierDB.jl | 2 +- src/joins_sq.jl | 230 ++++++------------------------------------------ 2 files changed, 30 insertions(+), 202 deletions(-) diff --git a/src/TidierDB.jl b/src/TidierDB.jl index e0181b3..9cd1e9b 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -19,7 +19,7 @@ using GZip @slice_min, @slice_sample, @rename, copy_to, duckdb_open, duckdb_connect, @semi_join, @full_join, @anti_join, connect, from_query, @interpolate, add_interp_parameter!, update_con, @head, clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery, show_tables, - t, @union + t abstract type SQLBackend end diff --git a/src/joins_sq.jl b/src/joins_sq.jl index 163b8ab..ea804db 100644 --- a/src/joins_sq.jl +++ b/src/joins_sq.jl @@ -25,95 +25,41 @@ end $docstring_left_join """ macro left_join(sqlquery, join_table, lhs_column, rhs_column) - # Convert column references to strings + # Convert column references to string lhs_col_str = string(lhs_column) rhs_col_str = string(rhs_column) - # Removed the QuoteNode wrapping to allow evaluation of join_table - # join_table = QuoteNode(join_table) + join_table = QuoteNode(join_table) return quote sq = $(esc(sqlquery)) - jq = $(esc(join_table)) # Evaluate join_table - if isa(sq, SQLQuery) needs_new_cte = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) if needs_new_cte sq.cte_count += 1 cte_name = "cte_" * string(sq.cte_count) + most_recent_source = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from - if isa(jq, SQLQuery) - # Handle when join_table is an SQLQuery - needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) - if needs_new_cte_jq - jq.cte_count += 1 - cte_name_jq = "jcte_" * string(jq.cte_count) - most_recent_source_jq = !isempty(jq.ctes) ? "jcte_" * string(jq.cte_count - 1) : jq.from - select_sql_jq = "SELECT * FROM " * most_recent_source_jq - new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) - push!(jq.ctes, new_cte_jq) - jq.from = cte_name_jq - end - # Combine CTEs and metadata - sq.ctes = vcat(sq.ctes, jq.ctes) - sq.metadata = vcat(sq.metadata, jq.metadata) - join_table_name = jq.from - else - # When join_table is a table name - join_table_name = string(jq) - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, join_table_name) - else - new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) - end - - join_sql = " " * most_recent_source * ".*, " * - get_join_columns(sq.db, join_table_name, $lhs_col_str) * gbq_join_parse(most_recent_source) * - " LEFT JOIN " * join_table_name * " ON " * - gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * - gbq_join_parse(most_recent_source) * "." * $rhs_col_str + join_sql = " " * most_recent_source * ".*, " * get_join_columns(sq.db, string($(esc(join_table))), $lhs_col_str) * gbq_join_parse(most_recent_source) * + " LEFT JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(most_recent_source), ".", $rhs_col_str) # Create and add the new CTE new_cte = CTE(name=cte_name, select=join_sql) push!(sq.ctes, new_cte) + # Update the FROM clause sq.from = cte_name else - if isa(jq, SQLQuery) - # Handle when join_table is an SQLQuery - needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) - - if needs_new_cte_jq - jq.cte_count += 1 - cte_name_jq = "ncte_" * string(jq.cte_count) - most_recent_source_jq = !isempty(jq.ctes) ? "cte_" * string(jq.cte_count - 1) : jq.from - select_sql_jq = "SELECT * FROM " * most_recent_source_jq - new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) - push!(jq.ctes, new_cte_jq) - jq.from = cte_name_jq - end - # Combine CTEs and metadata - sq.ctes = vcat(sq.ctes, jq.ctes) - sq.metadata = vcat(sq.metadata, jq.metadata) - join_table_name = jq.from - else - # When join_table is a table name - join_table_name = string(jq) - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, join_table_name) - else - new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) - end - - join_clause = " LEFT JOIN " * join_table_name * " ON " * - gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * - gbq_join_parse(sq.from) * "." * $rhs_col_str + join_clause = " LEFT JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(sq.from), ".", $rhs_col_str) sq.from *= join_clause end + + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, string($(esc(join_table)))) + else + new_metadata = get_table_metadata_athena(sq.db, string($(esc(join_table))), sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) else error("Expected sqlquery to be an instance of SQLQuery") end @@ -121,7 +67,6 @@ macro left_join(sqlquery, join_table, lhs_column, rhs_column) end end - """ $docstring_right_join """ @@ -222,92 +167,42 @@ end $docstring_full_join """ macro full_join(sqlquery, join_table, lhs_column, rhs_column) - # Convert column references to strings + # Convert column references to string lhs_col_str = string(lhs_column) rhs_col_str = string(rhs_column) + join_table = QuoteNode(join_table) return quote sq = $(esc(sqlquery)) - jq = $(esc(join_table)) # Evaluate join_table - if isa(sq, SQLQuery) needs_new_cte = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) + if needs_new_cte sq.cte_count += 1 cte_name = "cte_" * string(sq.cte_count) + most_recent_source = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from - if isa(jq, SQLQuery) - # Handle when join_table is an SQLQuery - needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) - if needs_new_cte_jq - jq.cte_count += 1 - cte_name_jq = "jcte_" * string(jq.cte_count) - most_recent_source_jq = !isempty(jq.ctes) ? "jcte_" * string(jq.cte_count - 1) : jq.from - select_sql_jq = "SELECT * FROM " * most_recent_source_jq - new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) - push!(jq.ctes, new_cte_jq) - jq.from = cte_name_jq - end - # Combine CTEs and metadata - sq.ctes = vcat(sq.ctes, jq.ctes) - sq.metadata = vcat(sq.metadata, jq.metadata) - join_table_name = jq.from - else - # When join_table is a table name - join_table_name = string(jq) - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, join_table_name) - else - new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) - end - - join_sql = " " * most_recent_source * ".*, " * - get_join_columns(sq.db, join_table_name, $lhs_col_str) * gbq_join_parse(most_recent_source) * - " FULL JOIN " * join_table_name * " ON " * - gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * - gbq_join_parse(most_recent_source) * "." * $rhs_col_str + join_sql = " " * most_recent_source * ".*, " * get_join_columns(sq.db, string($(esc(join_table))), $lhs_col_str) * gbq_join_parse(most_recent_source) * + " FULL JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(most_recent_source), ".", $rhs_col_str) # Create and add the new CTE new_cte = CTE(name=cte_name, select=join_sql) push!(sq.ctes, new_cte) + # Update the FROM clause sq.from = cte_name else - if isa(jq, SQLQuery) - # Handle when join_table is an SQLQuery - needs_new_cte_jq = !isempty(jq.select) || !isempty(jq.where) || jq.is_aggregated || !isempty(jq.ctes) - if needs_new_cte_jq - jq.cte_count += 1 - cte_name_jq = "ncte_" * string(jq.cte_count) - most_recent_source_jq = !isempty(jq.ctes) ? "cte_" * string(jq.cte_count - 1) : jq.from - select_sql_jq = "SELECT * FROM " * most_recent_source_jq - new_cte_jq = CTE(name=cte_name_jq, select=select_sql_jq) - push!(jq.ctes, new_cte_jq) - jq.from = cte_name_jq - end - # Combine CTEs and metadata - sq.ctes = vcat(sq.ctes, jq.ctes) - sq.metadata = vcat(sq.metadata, jq.metadata) - join_table_name = jq.from - else - # When join_table is a table name - join_table_name = string(jq) - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, join_table_name) - else - new_metadata = get_table_metadata_athena(sq.db, join_table_name, sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) - end - - join_clause = " FULL JOIN " * join_table_name * " ON " * - gbq_join_parse(join_table_name) * "." * $lhs_col_str * " = " * - gbq_join_parse(sq.from) * "." * $rhs_col_str + join_clause = " FULL JOIN " * string($(esc(join_table))) * " ON " * string(gbq_join_parse($(esc(join_table))), ".", $lhs_col_str, " = ", gbq_join_parse(sq.from), ".", $rhs_col_str) sq.from *= join_clause end + + if current_sql_mode[] != :athena + new_metadata = get_table_metadata(sq.db, string($(esc(join_table)))) + else + new_metadata = get_table_metadata_athena(sq.db, string($(esc(join_table))), sq.athena_params) + end + sq.metadata = vcat(sq.metadata, new_metadata) else error("Expected sqlquery to be an instance of SQLQuery") end @@ -316,7 +211,6 @@ macro full_join(sqlquery, join_table, lhs_column, rhs_column) end - """ $docstring_semi_join """ @@ -411,69 +305,3 @@ macro anti_join(sqlquery, join_table, lhs_column, rhs_column) sq end end - -""" -$docstring_union -""" -macro union(sqlquery, union_query) - return quote - sq = $(esc(sqlquery)) - uq = $(esc(union_query)) - - if isa(sq, SQLQuery) - # Determine if sq needs a new CTE - needs_new_cte_sq = !isempty(sq.select) || !isempty(sq.where) || sq.is_aggregated || !isempty(sq.ctes) - if needs_new_cte_sq - sq.cte_count += 1 - cte_name_sq = "cte_" * string(sq.cte_count) - most_recent_source_sq = !isempty(sq.ctes) ? "cte_" * string(sq.cte_count - 1) : sq.from - select_sql_sq = "SELECT * FROM " * most_recent_source_sq - new_cte_sq = CTE(name=cte_name_sq, select=select_sql_sq) - push!(sq.ctes, new_cte_sq) - sq.from = cte_name_sq - end - - # Prepare the union query - if isa(uq, SQLQuery) - # Determine if uq needs a new CTE - needs_new_cte_uq = !isempty(uq.select) || !isempty(uq.where) || uq.is_aggregated || !isempty(uq.ctes) - if needs_new_cte_uq - uq.cte_count += 1 - cte_name_uq = "cte_" * string(uq.cte_count) - most_recent_source_uq = !isempty(uq.ctes) ? "cte_" * string(uq.cte_count - 1) : uq.from - select_sql_uq = "SELECT * FROM " * most_recent_source_uq - new_cte_uq = CTE(name=cte_name_uq, select=select_sql_uq) - push!(uq.ctes, new_cte_uq) - uq.from = cte_name_uq - end - - # Combine the queries using UNION - union_sql = "SELECT * FROM " * sq.from * " UNION SELECT * FROM " * uq.from - - # Merge CTEs and metadata - sq.ctes = vcat(sq.ctes, uq.ctes) - sq.metadata = vcat(sq.metadata, uq.metadata) - else - # Treat uq as a table name - union_sql = "SELECT * FROM " * sq.from * " UNION SELECT * FROM " * string(uq) - # Update metadata - if current_sql_mode[] != :athena - new_metadata = get_table_metadata(sq.db, string(uq)) - else - new_metadata = get_table_metadata_athena(sq.db, string(uq), sq.athena_params) - end - sq.metadata = vcat(sq.metadata, new_metadata) - end - - # Create a new CTE for the union - sq.cte_count += 1 - union_cte_name = "cte_" * string(sq.cte_count) - union_cte = CTE(name=union_cte_name, select=union_sql) - push!(sq.ctes, union_cte) - sq.from = union_cte_name - else - error("Expected sqlquery to be an instance of SQLQuery") - end - sq - end -end \ No newline at end of file From 16f68ceb2e4a6bcbb41d6f47dd17bec23bbe3d9f Mon Sep 17 00:00:00 2001 From: Daniel Rizk Date: Fri, 27 Sep 2024 11:00:29 -0400 Subject: [PATCH 4/4] fixes `connect` for duckdb and adds docs for database file conxn --- NEWS.md | 4 ++++ Project.toml | 2 +- docs/examples/UserGuide/getting_started.jl | 10 ++++++++-- src/TBD_macros.jl | 2 +- src/TidierDB.jl | 13 ++++--------- src/docstrings.jl | 13 ++++++------- 6 files changed, 24 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 00f212d..18b67b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierDB.jl updates +## v0.3.5 - 2024-09- +- improves DuckDB `connect()` interface and documentation +- enhances `@window_frame` to allow for just a `to` or `from` argument, as well as autodetection for `preceding`, `following` and `unbounded` for the frame boundaries. + ## v0.3.4 - 2024 2024-09-23 TidierDB works with nearly any exisiting SQL function, now there are docs about it. - Docs on using any exisiting SQL function in TidierDB diff --git a/Project.toml b/Project.toml index 846ca7f..30fcf70 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierDB" uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b" authors = ["Daniel Rizk and contributors"] -version = "0.3.4" +version = "0.3.5" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/docs/examples/UserGuide/getting_started.jl b/docs/examples/UserGuide/getting_started.jl index e3e6774..94428a2 100644 --- a/docs/examples/UserGuide/getting_started.jl +++ b/docs/examples/UserGuide/getting_started.jl @@ -22,7 +22,13 @@ # conn = DB.connect(DB.duckdb()) # ``` -# You can also use establish a connection through an alternate method that you preferred, and use that as your connection as well. +# ## Connect to a local database file +# You can also connect to an existing database by passing the database file path as a string. +# ```julia +# db = DB.connect(DB.duckdb(), "mydb.duckdb") +# ``` + +# You can also establish any DuckDB connection through an alternate method that you prefer, and use that as your connection as well. # ## Package Extensions # The following backends utilize package extensions. To use one of backends listed below, you will need to write `using Library` @@ -42,9 +48,9 @@ # `db_table` starts the underlying SQL query struct, in addition to pulling the table metadata and storing it there. Storing metadata is what enables a lazy interface that also supports tidy selection. # - `db_table` has two required arguments: `connection` and `table` # - `table` can be a table name on a database or a path/url to file to read. When passing `db_table` a path or url, the table is not copied into memory. +# - Of note, `db_table` only support direct file paths to a table. It does not support database file paths such as `dbname.duckdb` or `dbname.sqlite`. Such files must be used with `connect` first. # - With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` read in all files matching the pattern. # - For example, the below would read all files that end in `.csv` in the given folder. -# ```julia # db_table(db, "folder/path/*.csv") # ``` # `db_table` also supports iceberg, delta, and S3 file paths via DuckDB. diff --git a/src/TBD_macros.jl b/src/TBD_macros.jl index 70630dc..3ba33e5 100644 --- a/src/TBD_macros.jl +++ b/src/TBD_macros.jl @@ -690,5 +690,5 @@ end $docstring_show_tables """ function show_tables(con::Union{DuckDB.DB, DuckDB.Connection}) - return DataFrame(DBInterface.execute(con, "SHOW TABLES")) + return DataFrame(DBInterface.execute(con, "SHOW ALL TABLES")) end diff --git a/src/TidierDB.jl b/src/TidierDB.jl index 9cd1e9b..520efe6 100644 --- a/src/TidierDB.jl +++ b/src/TidierDB.jl @@ -181,8 +181,9 @@ function get_table_metadata(conn::Union{DuckDB.DB, DuckDB.Connection}, table_nam result = DuckDB.execute(conn, query) |> DataFrame result[!, :current_selxn] .= 1 table_name = if occursin(r"[:/]", table_name) - split(basename(table_name), '.')[1] - #"'$table_name'" + split(basename(table_name), '.')[1] + elseif occursin(".", table_name) + split(basename(table_name), '.')[end] else table_name end @@ -412,13 +413,7 @@ function connect(::duckdb, db_type::Symbol; access_key::String="", secret_key::S end function connect(::duckdb, token::String) - if token == "md:" - return DBInterface.connect(DuckDB.DB, "md:") - elseif endswith(token, ".duckdb") || endswith(token, ".duck.db") - return DuckDB.DB(token) - else - return DBInterface.connect(DuckDB.DB, "md:$token") - end + return DBInterface.connect(DuckDB.DB, token) end end diff --git a/src/docstrings.jl b/src/docstrings.jl index 37e2273..fb5f7c6 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -1036,7 +1036,10 @@ This function establishes a database connection based on the specified backend a # Connect to AWS via DuckDB # aws_db = connect2(duckdb(), :aws, aws_access_key_id=get(ENV, "AWS_ACCESS_KEY_ID", "access_key"), aws_secret_access_key=get(ENV, "AWS_SECRET_ACCESS_KEY", "secret_access key"), aws_region=get(ENV, "AWS_DEFAULT_REGION", "us-east-1")) # Connect to MotherDuck -# connect(duckdb(), "token") for first connection, vs connect(duckdb(), "md:") for reconnection +# connect(duckdb(), ""md://..."") for first connection, vs connect(duckdb(), "md:") for reconnection +# Connect to exisiting database file +# connect(duckdb(), "path/to/database.duckdb") +# Open an in-memory database julia> db = connect(duckdb()) DuckDB.Connection(":memory:") ``` @@ -1094,7 +1097,7 @@ const docstring_db_table = db_table(database, table_name, athena_params, delta = false, iceberg = false) `db_table` starts the underlying SQL query struct, adding the metadata and table. If paths are passed directly to db_table instead of a -name it will not copy it to memory, but rather ready directly from the file. +name it will not copy it to memory, but rather ready directly from the file. `db_table` only supports direct file paths to a table. It does not support database file paths such as `dbname.duckdb` or `dbname.sqlite`. Such files must be used with `connect first` # Arguments - `database`: The Database or connection object @@ -1221,11 +1224,7 @@ Shows tables available in database. currently supports DuckDB, databricks, Snowf ```jldoctest julia> db = connect(duckdb()); -julia> show_tables(db) # there are no tables in when first loading so df below is empty. -0×1 DataFrame - Row │ name - │ String -─────┴──────── +julia> show_tables(db); ``` """