From 0c75e2001823e6daa67b2fcbc2bd6268356dea74 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 28 Jul 2024 10:11:53 -0400 Subject: [PATCH 1/9] Update docstrings so that cross references work in online docs. Also switch from autosummary to autoapi in sphinx for building API reference documents --- docs/source/api.rst | 31 --- docs/source/api/dataframe.rst | 27 --- docs/source/api/execution_context.rst | 29 --- docs/source/api/expression.rst | 27 --- docs/source/api/functions.rst | 27 --- docs/source/api/object_store.rst | 27 --- docs/source/conf.py | 42 ++-- docs/source/index.rst | 2 - docs/source/user-guide/basics.rst | 2 + .../common-operations/expressions.rst | 2 + .../common-operations/functions.rst | 2 +- docs/source/user-guide/configuration.rst | 4 +- python/datafusion/catalog.py | 4 +- python/datafusion/context.py | 130 +++++------ python/datafusion/dataframe.py | 107 +++++---- python/datafusion/expr.py | 23 +- python/datafusion/functions.py | 210 +++++++++--------- python/datafusion/record_batch.py | 14 +- python/datafusion/substrait.py | 3 +- python/datafusion/udf.py | 28 +-- 20 files changed, 300 insertions(+), 441 deletions(-) delete mode 100644 docs/source/api.rst delete mode 100644 docs/source/api/dataframe.rst delete mode 100644 docs/source/api/execution_context.rst delete mode 100644 docs/source/api/expression.rst delete mode 100644 docs/source/api/functions.rst delete mode 100644 docs/source/api/object_store.rst diff --git a/docs/source/api.rst b/docs/source/api.rst deleted file mode 100644 index d9f4a09d..00000000 --- a/docs/source/api.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. 
with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api: - -************* -API Reference -************* - -.. toctree:: - :maxdepth: 2 - - api/dataframe - api/execution_context - api/expression - api/functions - api/object_store diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst deleted file mode 100644 index 0a3c4c8b..00000000 --- a/docs/source/api/dataframe.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.dataframe: -.. currentmodule:: datafusion - -DataFrame -========= - -.. autosummary:: - :toctree: ../generated/ - - DataFrame diff --git a/docs/source/api/execution_context.rst b/docs/source/api/execution_context.rst deleted file mode 100644 index a3bda76d..00000000 --- a/docs/source/api/execution_context.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. 
Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.execution_context: -.. currentmodule:: datafusion - -SessionContext -============== - -.. autosummary:: - :toctree: ../generated/ - - SessionConfig - RuntimeConfig - SessionContext diff --git a/docs/source/api/expression.rst b/docs/source/api/expression.rst deleted file mode 100644 index 30137d13..00000000 --- a/docs/source/api/expression.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. 
- -.. _api.expression: -.. currentmodule:: datafusion - -Expr -========== - -.. autosummary:: - :toctree: ../generated/ - - Expr diff --git a/docs/source/api/functions.rst b/docs/source/api/functions.rst deleted file mode 100644 index 6f10d826..00000000 --- a/docs/source/api/functions.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.functions: -.. currentmodule:: datafusion - -Functions -========= - -.. autosummary:: - :toctree: ../generated/ - - functions diff --git a/docs/source/api/object_store.rst b/docs/source/api/object_store.rst deleted file mode 100644 index 8d78f072..00000000 --- a/docs/source/api/object_store.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. 
Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _api.object_store: -.. currentmodule:: datafusion.object_store - -ObjectStore -=========== - -.. autosummary:: - :toctree: ../generated/ - - object_store \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 308069b6..3a66aaf5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,6 +55,7 @@ "sphinx.ext.napoleon", "myst_parser", "IPython.sphinxext.ipython_directive", + "autoapi.extension", ] source_suffix = { @@ -80,23 +81,36 @@ autosummary_generate = True +autoapi_dirs = ["../../python"] +autoapi_ignore = ["*tests*"] +autoapi_member_order = "groupwise" +suppress_warnings = ["autoapi.python_import_resolution"] +autoapi_keep_files = True +autoapi_python_class_content = "both" -def autodoc_skip_member(app, what, name, obj, skip, options): - exclude_functions = "__init__" - exclude_classes = ("Expr", "DataFrame") - class_name = "" - if hasattr(obj, "__qualname__"): - if obj.__qualname__ is not None: - class_name = obj.__qualname__.split(".")[0] +def autoapi_skip_member_fn(app, what, name, obj, skip, options): + skip_contents = [ + # Re-exports + ("class", "datafusion.DataFrame"), + ("class", "datafusion.SessionContext"), + ("module", "datafusion.common"), + # Deprecated + ("class", "datafusion.substrait.serde"), + ("class", "datafusion.substrait.plan"), + ("class", "datafusion.substrait.producer"), + ("class", "datafusion.substrait.consumer"), + ("method", "datafusion.context.SessionContext.tables"), + ("method", "datafusion.dataframe.DataFrame.unnest_column"), + ] + if (what, name) in skip_contents: + skip = True - should_exclude = name in exclude_functions and class_name in exclude_classes + 
return skip - return True if should_exclude else None - -def setup(app): - app.connect("autodoc-skip-member", autodoc_skip_member) +def setup(sphinx): + sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) # -- Options for HTML output ------------------------------------------------- @@ -106,9 +120,7 @@ def setup(app): # html_theme = "pydata_sphinx_theme" -html_theme_options = { - "use_edit_page_button": True, -} +html_theme_options = {"use_edit_page_button": False, "show_toc_level": 2} html_context = { "github_user": "apache", diff --git a/docs/source/index.rst b/docs/source/index.rst index 16c88e03..b0103a33 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -104,5 +104,3 @@ Example :hidden: :maxdepth: 1 :caption: API - - api diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 438b2319..4ac095e4 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _user_guide_concepts: + Concepts ======== diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index ebb514f1..447a90bb 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _expressions: + Expressions =========== diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index d793314f..7efb939e 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -19,7 +19,7 @@ Functions ========= DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions. 
-In here we will cover some of the more popular use cases. If you want to view all the functions go to the :ref:`Functions` API Reference. +In here we will cover some of the more popular use cases. If you want to view all the functions go to the :py:mod:`Functions ` API Reference. We'll use the pokemon dataset in the following examples. diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst index 0c1a4818..9c506b7e 100644 --- a/docs/source/user-guide/configuration.rst +++ b/docs/source/user-guide/configuration.rst @@ -47,5 +47,5 @@ a :code:`SessionConfig` and :code:`RuntimeConfig` object. These two cover a wide print(ctx) -You can read more about available :code:`SessionConfig` options `here `_, -and about :code:`RuntimeConfig` options `here `_. +You can read more about available :code:`SessionConfig` options in the `rust DataFusion Configuration guide `_, +and about :code:`RuntimeConfig` options in the rust `online API documentation `_. diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index cec0be76..acd28f33 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -39,7 +39,7 @@ def names(self) -> list[str]: return self.catalog.names() def database(self, name: str = "public") -> Database: - """Returns the database with the given `name` from this catalog.""" + """Returns the database with the given ``name`` from this catalog.""" return Database(self.catalog.database(name)) @@ -55,7 +55,7 @@ def names(self) -> set[str]: return self.db.names() def table(self, name: str) -> Table: - """Return the table with the given `name` from this database.""" + """Return the table with the given ``name`` from this database.""" return Table(self.db.table(name)) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index a717db10..731ff530 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -46,7 +46,7 @@ class SessionConfig: """Session 
configuration options.""" def __init__(self, config_options: dict[str, str] | None = None) -> None: - """Create a new `SessionConfig` with the given configuration options. + """Create a new :py:class:`SessionConfig` with the given configuration options. Args: config_options: Configuration options. @@ -63,7 +63,7 @@ def with_create_default_catalog_and_schema( automatically created. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = ( self.config_internal.with_create_default_catalog_and_schema(enabled) @@ -80,7 +80,7 @@ def with_default_catalog_and_schema( schema: Schema name. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_default_catalog_and_schema( catalog, schema @@ -88,13 +88,13 @@ def with_default_catalog_and_schema( return self def with_information_schema(self, enabled: bool = True) -> SessionConfig: - """Enable or disable the inclusion of `information_schema` virtual tables. + """Enable or disable the inclusion of ``information_schema`` virtual tables. Args: - enabled: Whether to include `information_schema` virtual tables. + enabled: Whether to include ``information_schema`` virtual tables. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_information_schema(enabled) return self @@ -106,7 +106,7 @@ def with_batch_size(self, batch_size: int) -> SessionConfig: batch_size: Batch size. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_batch_size(batch_size) return self @@ -120,7 +120,7 @@ def with_target_partitions(self, target_partitions: int) -> SessionConfig: target_partitions: Number of target partitions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_target_partitions( target_partitions @@ -136,7 +136,7 @@ def with_repartition_aggregations(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for aggregations. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_aggregations( enabled @@ -150,7 +150,7 @@ def with_repartition_joins(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for joins. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_joins(enabled) return self @@ -164,7 +164,7 @@ def with_repartition_windows(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_windows(enabled) return self @@ -178,7 +178,7 @@ def with_repartition_sorts(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_repartition_sorts(enabled) return self @@ -190,7 +190,7 @@ def with_repartition_file_scans(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for file scans. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_scans(enabled) return self @@ -202,7 +202,7 @@ def with_repartition_file_min_size(self, size: int) -> SessionConfig: size: Minimum file range size. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_min_size(size) return self @@ -216,7 +216,7 @@ def with_parquet_pruning(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use pruning predicate for parquet readers. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_parquet_pruning(enabled) return self @@ -229,7 +229,7 @@ def set(self, key: str, value: str) -> SessionConfig: value: Option value. Returns: - A new `SessionConfig` object with the updated setting. + A new :py:class:`SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.set(key, value) return self @@ -239,14 +239,14 @@ class RuntimeConfig: """Runtime configuration options.""" def __init__(self) -> None: - """Create a new `RuntimeConfig` with default values.""" + """Create a new :py:class:`RuntimeConfig` with default values.""" self.config_internal = RuntimeConfigInternal() def with_disk_manager_disabled(self) -> RuntimeConfig: """Disable the disk manager, attempts to create temporary files will error. Returns: - A new `RuntimeConfig` object with the updated setting. 
+ A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_disabled() return self @@ -255,7 +255,7 @@ def with_disk_manager_os(self) -> RuntimeConfig: """Use the operating system's temporary directory for disk manager. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_os() return self @@ -267,7 +267,7 @@ def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConf paths: Paths to use for the disk manager's temporary files. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ paths = [str(p) for p in paths] self.config_internal = self.config_internal.with_disk_manager_specified(paths) @@ -277,7 +277,7 @@ def with_unbounded_memory_pool(self) -> RuntimeConfig: """Use an unbounded memory pool. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_unbounded_memory_pool() return self @@ -303,7 +303,7 @@ def with_fair_spill_pool(self, size: int) -> RuntimeConfig: size: Size of the memory pool in bytes. Returns: - A new ``RuntimeConfig`` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Examples usage:: @@ -316,14 +316,14 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeConfig: """Use a greedy memory pool with the specified size. This pool works well for queries that do not need to spill or have a single - spillable operator. See `RuntimeConfig.with_fair_spill_pool` if there are + spillable operator. See :py:func:`with_fair_spill_pool` if there are multiple spillable operators that all will spill. Args: size: Size of the memory pool in bytes. 
Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Example usage:: @@ -339,7 +339,7 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: path: Path to use for temporary files. Returns: - A new `RuntimeConfig` object with the updated setting. + A new :py:class:`RuntimeConfig` object with the updated setting. Example usage:: @@ -350,10 +350,10 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig: class SQLOptions: - """Options to be used when performing SQL queries on the ``SessionContext``.""" + """Options to be used when performing SQL queries.""" def __init__(self) -> None: - """Create a new `SQLOptions` with default values. + """Create a new :py:class:`SQLOptions` with default values. The default values are: - DDL commands are allowed @@ -365,13 +365,13 @@ def __init__(self) -> None: def with_allow_ddl(self, allow: bool = True) -> SQLOptions: """Should DDL (Data Definition Language) commands be run? - Examples of DDL commands include `CREATE TABLE` and `DROP TABLE`. + Examples of DDL commands include ``CREATE TABLE`` and ``DROP TABLE``. Args: allow: Allow DDL commands to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. Example usage:: @@ -383,13 +383,13 @@ def with_allow_ddl(self, allow: bool = True) -> SQLOptions: def with_allow_dml(self, allow: bool = True) -> SQLOptions: """Should DML (Data Manipulation Language) commands be run? - Examples of DML commands include `INSERT INTO` and `DELETE`. + Examples of DML commands include ``INSERT INTO`` and ``DELETE``. Args: allow: Allow DML commands to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. 
Example usage:: @@ -399,13 +399,13 @@ def with_allow_dml(self, allow: bool = True) -> SQLOptions: return self def with_allow_statements(self, allow: bool = True) -> SQLOptions: - """Should statements such as `SET VARIABLE` and `BEGIN TRANSACTION` be run? + """Should statements such as ``SET VARIABLE`` and ``BEGIN TRANSACTION`` be run? Args: allow: Allow statements to be run. Returns: - A new `SQLOptions` object with the updated setting. + A new :py:class:`SQLOptions` object with the updated setting. Example usage:: @@ -418,8 +418,7 @@ class SessionContext: """This is the main interface for executing queries and creating DataFrames. - See https://datafusion.apache.org/python/user-guide/basics.html for - additional information. + See :ref:`user_guide_concepts` in the online documentation for more information. """ def __init__( @@ -438,7 +437,7 @@ def __init__( Example usage: The following example demostrates how to use the context to execute - a query against a CSV data source using the ``DataFrame`` API:: + a query against a CSV data source using the :py:class:`DataFrame` API:: from datafusion import SessionContext @@ -455,7 +454,7 @@ def register_object_store(self, schema: str, store: Any, host: str | None) -> No Args: schema: The data source schema. - store: The `ObjectStore` to register. + store: The :py:class:`~datafusion.object_store.ObjectStore` to register. host: URL for the host. """ self.ctx.register_object_store(schema, store, host) @@ -471,8 +470,9 @@ def register_listing_table( ) -> None: """Register multiple files as a single table. - Registers a `Table` that can assemble multiple files from locations in - an `ObjectStore` instance. + Registers a :py:class:`~datafusion.catalog.Table` that can assemble multiple + files from locations in an :py:class:`~datafusion.object_store.ObjectStore` + instance. Args: name: Name of the resultant table. 
@@ -496,11 +496,12 @@ def register_listing_table( ) def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: - """Create a `DataFrame` from SQL query text. + """Create a :py:class:`~datafusion.DataFrame` from SQL query text. - Note: This API implements DDL statements such as `CREATE TABLE` and - `CREATE VIEW` and DML statements such as `INSERT INTO` with in-memory - default implementation. See `SessionContext.sql_with_options`. + Note: This API implements DDL statements such as ``CREATE TABLE`` and + ``CREATE VIEW`` and DML statements such as ``INSERT INTO`` with in-memory + default implementation. See + :py:func:`~datafusion.context.SessionContext.sql_with_options`. Args: query: SQL query text. @@ -514,7 +515,7 @@ def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: return DataFrame(self.ctx.sql_with_options(query, options.options_internal)) def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: - """Create a `DataFrame` from SQL query text. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. This function will first validating that the query is allowed by the provided options. @@ -537,7 +538,7 @@ def create_dataframe( """Create and return a dataframe using the provided partitions. Args: - partitions: `RecordBatch` partitions to register. + partitions: :py:class:`pyarrow.RecordBatch` partitions to register. name: Resultant dataframe name. schema: Schema for the partitions. @@ -547,7 +548,7 @@ return DataFrame(self.ctx.create_dataframe(partitions, name, schema)) def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: - """Create a `DataFrame` from an existing logical plan. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an existing plan. Args: plan: Logical plan. 
@@ -560,7 +561,7 @@ def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: def from_pylist( self, data: list[dict[str, Any]], name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from a list of dictionaries. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a list. Args: data: List of dictionaries. @@ -574,7 +575,7 @@ def from_pylist( def from_pydict( self, data: dict[str, list[Any]], name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from a dictionary of lists. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a dictionary. Args: data: Dictionary of lists. @@ -588,7 +589,7 @@ def from_pydict( def from_arrow_table( self, data: pyarrow.Table, name: str | None = None ) -> DataFrame: - """Create a `DataFrame` from an Arrow table. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow table. Args: data: Arrow table. @@ -600,7 +601,7 @@ def from_arrow_table( return DataFrame(self.ctx.from_arrow_table(data, name)) def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFrame: - """Create a `DataFrame` from a Pandas DataFrame. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. Args: data: Pandas DataFrame. @@ -612,7 +613,7 @@ def from_pandas(self, data: pandas.DataFrame, name: str | None = None) -> DataFr return DataFrame(self.ctx.from_pandas(data, name)) def from_polars(self, data: polars.DataFrame, name: str | None = None) -> DataFrame: - """Create a `DataFrame` from a Polars DataFrame. + """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. Args: data: Polars DataFrame. @@ -799,7 +800,7 @@ def register_avro( ) def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: - """Register a `pyarrow.dataset.Dataset` as a table. + """Register a :py:class:`pyarrow.dataset.Dataset` as a table. Args: name: Name of the table to register. 
@@ -820,7 +821,7 @@ def catalog(self, name: str = "datafusion") -> Catalog: return self.ctx.catalog(name) @deprecated( - "Use the catalog provider interface `SessionContext.catalog` to " + "Use the catalog provider interface ``SessionContext.Catalog`` to " "examine available catalogs, schemas and tables" ) def tables(self) -> set[str]: @@ -828,7 +829,7 @@ def tables(self) -> set[str]: return self.ctx.tables() def table(self, name: str) -> DataFrame: - """Retrieve a `DataFrame` representing a previously registered table.""" + """Retrieve a previously registered table by name.""" return DataFrame(self.ctx.table(name)) def table_exist(self, name: str) -> bool: @@ -836,11 +837,11 @@ def table_exist(self, name: str) -> bool: return self.ctx.table_exist(name) def empty_table(self) -> DataFrame: - """Create an empty `DataFrame`.""" + """Create an empty :py:class:`~datafusion.dataframe.DataFrame`.""" return DataFrame(self.ctx.empty_table()) def session_id(self) -> str: - """Retrun an id that uniquely identifies this `SessionContext`.""" + """Retrun an id that uniquely identifies this :py:class:`SessionContext`.""" return self.ctx.session_id() def read_json( @@ -852,7 +853,7 @@ def read_json( table_partition_cols: list[tuple[str, str]] | None = None, file_compression_type: str | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading a line-delimited JSON data source. + """Read a line-delimited JSON data source. Args: path: Path to the JSON file. @@ -891,7 +892,7 @@ def read_csv( table_partition_cols: list[tuple[str, str]] | None = None, file_compression_type: str | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading a CSV data source. + """Read a CSV data source. Args: path: Path to the CSV file @@ -936,7 +937,7 @@ def read_parquet( schema: pyarrow.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: - """Create a `DataFrame` for reading Parquet data source. 
+ """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. Args: path: Path to the Parquet file. @@ -977,7 +978,7 @@ def read_avro( file_partition_cols: list[tuple[str, str]] | None = None, file_extension: str = ".avro", ) -> DataFrame: - """Create a ``DataFrame`` for reading Avro data source. + """Create a :py:class:`DataFrame` for reading Avro data source. Args: path: Path to the Avro file. @@ -995,9 +996,14 @@ def read_avro( ) def read_table(self, table: Table) -> DataFrame: - """Creates a ``DataFrame`` for a ``Table`` such as a ``ListingTable``.""" + """Creates a :py:class:`~datafusion.dataframe.DataFrame` from a table. + + For a :py:class:`~datafusion.catalog.Table` such as a + :py:class:`~datafusion.catalog.ListingTable`, create a + :py:class:`~datafusion.dataframe.DataFrame`. + """ return DataFrame(self.ctx.read_table(table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: - """Execute the `plan` and return the results.""" + """Execute the ``plan`` and return the results.""" return RecordBatchStream(self.ctx.execute(plan, partitions)) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 68e6298f..f9c073b3 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -14,10 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""DataFrame is one of the core concepts in DataFusion. +""":py:class:`DataFrame` is one of the core concepts in DataFusion. -See https://datafusion.apache.org/python/user-guide/basics.html for more -information. +See :ref:`user_guide_concepts` in the online documentation for more information. """ from __future__ import annotations @@ -43,19 +42,19 @@ class DataFrame: """Two dimensional table representation of data. - See https://datafusion.apache.org/python/user-guide/basics.html for more - information. 
+ See :ref:`user_guide_concepts` in the online documentation for more information. """ def __init__(self, df: DataFrameInternal) -> None: """This constructor is not to be used by the end user. - See ``SessionContext`` for methods to create DataFrames. + See :py:class:`~datafusion.context.SessionContext` for methods to + create a :py:class:`DataFrame`. """ self.df = df def __getitem__(self, key: str | List[str]) -> DataFrame: - """Return a new `DataFrame` with the specified column or columns. + """Return a new :py:class`DataFrame` with the specified column or columns. Args: key: Column name or list of column names to select. @@ -74,7 +73,7 @@ def __repr__(self) -> str: return self.df.__repr__() def describe(self) -> DataFrame: - """Return a new `DataFrame` that has statistics for a DataFrame. + """Return the statistics for this DataFrame. Only summarized numeric datatypes at the moments and returns nulls for non-numeric datatypes. @@ -87,7 +86,7 @@ def describe(self) -> DataFrame: return DataFrame(self.df.describe()) def schema(self) -> pa.Schema: - """Return the `pyarrow.Schema` describing the output of this DataFrame. + """Return the :py:class:`pyarrow.Schema` of this DataFrame. The output schema contains information on the name, data type, and nullability for each column. @@ -106,10 +105,10 @@ def select_columns(self, *args: str) -> DataFrame: return self.select(*args) def select(self, *exprs: Expr | str) -> DataFrame: - """Project arbitrary expressions into a new `DataFrame`. + """Project arbitrary expressions into a new :py:class:`DataFrame`. Args: - exprs: Either column names or `Expr` to select. + exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select. Returns: DataFrame after projection. It has one column for each expression. @@ -117,9 +116,9 @@ def select(self, *exprs: Expr | str) -> DataFrame: Example usage: The following example will return 3 columns from the original dataframe. 
- The first two columns will be the original column `a` and `b` since the + The first two columns will be the original column ``a`` and ``b`` since the string "a" is assumed to refer to column selection. Also a duplicate of - column `a` will be returned with the column name `alternate_a`:: + column ``a`` will be returned with the column name ``alternate_a``:: df = df.select("a", col("b"), col("a").alias("alternate_a")) @@ -131,12 +130,12 @@ def select(self, *exprs: Expr | str) -> DataFrame: return DataFrame(self.df.select(*exprs)) def filter(self, *predicates: Expr) -> DataFrame: - """Return a DataFrame for which `predicate` evaluates to `True`. + """Return a DataFrame for which ``predicate`` evaluates to ``True``. - Rows for which `predicate` evaluates to `False` or `None` are filtered + Rows for which ``predicate`` evaluates to ``False`` or ``None`` are filtered out. If more than one predicate is provided, these predicates will be combined as a logical AND. If more complex logic is required, see the - logical operations in `datafusion.functions`. + logical operations in :py:mod:`~datafusion.functions`. Args: predicates: Predicate expression(s) to filter the DataFrame. @@ -162,12 +161,12 @@ def with_column(self, name: str, expr: Expr) -> DataFrame: return DataFrame(self.df.with_column(name, expr.expr)) def with_column_renamed(self, old_name: str, new_name: str) -> DataFrame: - """Rename one column by applying a new projection. + r"""Rename one column by applying a new projection. This is a no-op if the column to be renamed does not exist. The method supports case sensitive rename with wrapping column name - into one the following symbols (" or ' or `). + into one the following symbols (" or ' or \`). Args: old_name: Old column name. @@ -196,7 +195,7 @@ def sort(self, *exprs: Expr) -> DataFrame: """Sort the DataFrame by the specified sorting expressions. Note that any expression can be turned into a sort expression by - calling its `sort` method. 
+ calling its` ``sort`` method. Args: exprs: Sort expressions, applied in order. @@ -208,7 +207,7 @@ def sort(self, *exprs: Expr) -> DataFrame: return DataFrame(self.df.sort(*exprs)) def limit(self, count: int, offset: int = 0) -> DataFrame: - """Return a new `DataFrame` with a limited number of rows. + """Return a new :py:class:`DataFrame` with a limited number of rows. Args: count: Number of rows to limit the DataFrame to. @@ -220,14 +219,14 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: return DataFrame(self.df.limit(count, offset)) def collect(self) -> list[pa.RecordBatch]: - """Execute this `DataFrame` and collect results into memory. + """Execute this :py:class:`DataFrame` and collect results into memory. - Prior to calling `collect`, modifying a DataFrme simply updates a plan - (no actual computation is performed). Calling `collect` triggers the + Prior to calling ``collect``, modifying a DataFrme simply updates a plan + (no actual computation is performed). Calling ``collect`` triggers the computation. Returns: - List of `pyarrow.RecordBatch`es collected from the DataFrame. + List of :py:class:`pyarrow.RecordBatch` collected from the DataFrame. """ return self.df.collect() @@ -242,11 +241,11 @@ def cache(self) -> DataFrame: def collect_partitioned(self) -> list[list[pa.RecordBatch]]: """Execute this DataFrame and collect all partitioned results. - This operation returns ``RecordBatch`` maintaining the input + This operation returns :py:class:`pyarrow.RecordBatch` maintaining the input partitioning. Returns: - List of list of ``RecordBatch`` collected from the + List of list of :py:class:`RecordBatch` collected from the DataFrame. """ return self.df.collect_partitioned() @@ -260,7 +259,7 @@ def show(self, num: int = 20) -> None: self.df.show(num) def distinct(self) -> DataFrame: - """Return a new `DataFrame` with all duplicated rows removed. + """Return a new :py:class:`DataFrame` with all duplicated rows removed. 
Returns: DataFrame after removing duplicates. @@ -273,7 +272,7 @@ def join( join_keys: tuple[list[str], list[str]], how: str, ) -> DataFrame: - """Join this `DataFrame` with another `DataFrame`. + """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. Join keys are a pair of lists of column names in the left and right dataframes, respectively. These lists must have the same length. @@ -292,11 +291,11 @@ def join( def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: """Return a DataFrame with the explanation of its plan so far. - If `analyze` is specified, runs the plan and reports metrics. + If ``analyze`` is specified, runs the plan and reports metrics. Args: - verbose: If `True`, more details will be included. - analyze: If `True`, the plan will run and metrics reported. + verbose: If ``True``, more details will be included. + analyze: If ``Tru`e``, the plan will run and metrics reported. Returns: DataFrame with the explanation of its plan. @@ -304,7 +303,7 @@ def explain(self, verbose: bool = False, analyze: bool = False) -> DataFrame: return DataFrame(self.df.explain(verbose, analyze)) def logical_plan(self) -> LogicalPlan: - """Return the unoptimized `LogicalPlan` that comprises this `DataFrame`. + """Return the unoptimized ``LogicalPlan``. Returns: Unoptimized logical plan. @@ -312,7 +311,7 @@ def logical_plan(self) -> LogicalPlan: return self.df.logical_plan() def optimized_logical_plan(self) -> LogicalPlan: - """Return the optimized `LogicalPlan` that comprises this `DataFrame`. + """Return the optimized ``LogicalPlan``. Returns: Optimized logical plan. @@ -320,7 +319,7 @@ def optimized_logical_plan(self) -> LogicalPlan: return self.df.optimized_logical_plan() def execution_plan(self) -> ExecutionPlan: - """Return the execution/physical plan that comprises this `DataFrame`. + """Return the execution/physical plan. Returns: Execution plan. 
@@ -328,7 +327,7 @@ def execution_plan(self) -> ExecutionPlan: return self.df.execution_plan() def repartition(self, num: int) -> DataFrame: - """Repartition a DataFrame into `num` partitions. + """Repartition a DataFrame into ``num`` partitions. The batches allocation uses a round-robin algorithm. @@ -354,13 +353,13 @@ def repartition_by_hash(self, *exprs: Expr, num: int) -> DataFrame: return DataFrame(self.df.repartition_by_hash(*exprs, num=num)) def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Calculate the union of two `DataFrame`s. + """Calculate the union of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to union with. - distinct: If `True`, duplicate rows will be removed. + distinct: If ``True``, duplicate rows will be removed. Returns: DataFrame after union. @@ -368,9 +367,9 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: return DataFrame(self.df.union(other.df, distinct)) def union_distinct(self, other: DataFrame) -> DataFrame: - """Calculate the distinct union of two `DataFrame`s. + """Calculate the distinct union of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Any duplicate rows are discarded. Args: @@ -382,9 +381,9 @@ def union_distinct(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.union_distinct(other.df)) def intersect(self, other: DataFrame) -> DataFrame: - """Calculate the intersection of two `DataFrame`s. + """Calculate the intersection of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to intersect with. 
@@ -395,9 +394,9 @@ def intersect(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.intersect(other.df)) def except_all(self, other: DataFrame) -> DataFrame: - """Calculate the exception of two `DataFrame`s. + """Calculate the exception of two :py:class:`DataFrame`. - The two `DataFrame`s must have exactly the same schema. + The two :py:class:`DataFrame` must have exactly the same schema. Args: other: DataFrame to calculate exception with. @@ -408,7 +407,7 @@ def except_all(self, other: DataFrame) -> DataFrame: return DataFrame(self.df.except_all(other.df)) def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None: - """Execute the `DataFrame` and write the results to a CSV file. + """Execute the :py:class:`DataFrame` and write the results to a CSV file. Args: path: Path of the CSV file to write. @@ -422,7 +421,7 @@ def write_parquet( compression: str = "uncompressed", compression_level: int | None = None, ) -> None: - """Execute the `DataFrame` and write the results to a Parquet file. + """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. @@ -432,7 +431,7 @@ def write_parquet( self.df.write_parquet(str(path), compression, compression_level) def write_json(self, path: str | pathlib.Path) -> None: - """Execute the `DataFrame` and write the results to a JSON file. + """Execute the :py:class:`DataFrame` and write the results to a JSON file. Args: path: Path of the JSON file to write. @@ -440,7 +439,7 @@ def write_json(self, path: str | pathlib.Path) -> None: self.df.write_json(str(path)) def to_arrow_table(self) -> pa.Table: - """Execute the `DataFrame` and convert it into an Arrow Table. + """Execute the :py:class:`DataFrame` and convert it into an Arrow Table. Returns: Arrow Table. 
@@ -465,7 +464,7 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]: return [RecordBatchStream(rbs) for rbs in streams] def to_pandas(self) -> pd.DataFrame: - """Execute the `DataFrame` and convert it into a Pandas DataFrame. + """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame. Returns: Pandas DataFrame. @@ -473,7 +472,7 @@ def to_pandas(self) -> pd.DataFrame: return self.df.to_pandas() def to_pylist(self) -> list[dict[str, Any]]: - """Execute the `DataFrame` and convert it into a list of dictionaries. + """Execute the :py:class:`DataFrame` and convert it into a list of dictionaries. Returns: List of dictionaries. @@ -481,7 +480,7 @@ def to_pylist(self) -> list[dict[str, Any]]: return self.df.to_pylist() def to_pydict(self) -> dict[str, list[Any]]: - """Execute the `DataFrame` and convert it into a dictionary of lists. + """Execute the :py:class:`DataFrame` and convert it into a dictionary of lists. Returns: Dictionary of lists. @@ -489,7 +488,7 @@ def to_pydict(self) -> dict[str, list[Any]]: return self.df.to_pydict() def to_polars(self) -> pl.DataFrame: - """Execute the `DataFrame` and convert it into a Polars DataFrame. + """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame. Returns: Polars DataFrame. @@ -497,7 +496,7 @@ def to_polars(self) -> pl.DataFrame: return self.df.to_polars() def count(self) -> int: - """Return the total number of rows in this `DataFrame`. + """Return the total number of rows in this :py:class:`DataFrame`. Note that this method will actually run a plan to calculate the count, which may be slow for large or complicated DataFrames. 
@@ -509,7 +508,7 @@ def count(self) -> int: @deprecated("Use :func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: - """See ``unnest_columns``.""" + """See :py:func:`unnest_columns`.""" return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls)) def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFrame: diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 318b8b9a..4bf33baf 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -17,7 +17,7 @@ """This module supports expressions, one of the core concepts in DataFusion. -See ``Expr`` for more details. +See :ref:`Expressions` in the online documentation for more details. """ from __future__ import annotations @@ -155,8 +155,7 @@ class Expr: """Expression object. Expressions are one of the core concepts in DataFusion. See - https://datafusion.apache.org/python/user-guide/common-operations/expressions.html - for more information. + :ref:`Expressions` in the online documentation for more information. """ def __init__(self, expr: expr_internal.Expr) -> None: @@ -324,7 +323,7 @@ def __lt__(self, rhs: Any) -> Expr: def literal(value: Any) -> Expr: """Creates a new expression representing a scalar value. - `value` must be a valid PyArrow scalar value or easily castable to one. + ``value`` must be a valid PyArrow scalar value or easily castable to one. 
""" if not isinstance(value, pa.Scalar): value = pa.scalar(value) @@ -332,7 +331,7 @@ def literal(value: Any) -> Expr: @staticmethod def column(value: str) -> Expr: - """Creates a new expression representing a column in a ``DataFrame``.""" + """Creates a new expression representing a column.""" return Expr(expr_internal.Expr.column(value)) def alias(self, name: str) -> Expr: @@ -340,7 +339,7 @@ def alias(self, name: str) -> Expr: return Expr(self.expr.alias(name)) def sort(self, ascending: bool = True, nulls_first: bool = True) -> Expr: - """Creates a sort ``Expr`` from an existing ``Expr``. + """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. Args: ascending: If true, sort in ascending order. @@ -361,7 +360,7 @@ def rex_type(self) -> RexType: A Rex (Row Expression) specifies a single row of data.That specification could include user defined functions or types. RexType identifies the - row as one of the possible valid ``RexType``(s). + row as one of the possible valid ``RexType``. """ return self.expr.rex_type() @@ -412,12 +411,12 @@ def __init__( """Construct a window frame using the given parameters. Args: - units: Should be one of `rows`, `range`, or `groups`. + units: Should be one of ``rows``, ``range``, or ``groups``. start_bound: Sets the preceeding bound. Must be >= 0. If none, this - will be set to unbounded. If unit type is `groups`, this + will be set to unbounded. If unit type is ``groups``, this parameter must be set. end_bound: Sets the following bound. Must be >= 0. If none, this - will be set to unbounded. If unit type is `groups`, this + will be set to unbounded. If unit type is ``groups``, this parameter must be set. """ self.window_frame = expr_internal.WindowFrame(units, start_bound, end_bound) @@ -438,7 +437,7 @@ def get_upper_bound(self): class WindowFrameBound: """Defines a single window frame bound. - ``WindowFrame`` typically requires a start and end bound. 
+ :py:class:`WindowFrame` typically requires a start and end bound. """ def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None: @@ -485,7 +484,7 @@ def __init__(self, case_builder: expr_internal.CaseBuilder) -> None: """Constructs a case builder. This is not typically called by the end user directly. See - ``datafusion.functions.case`` instead. + :py:func:`datafusion.functions.case` instead. """ self.case_builder = case_builder diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index be83d359..3c9e01bc 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""This module contains the user functions for operating on ``Expr``.""" +"""User functions for operating on :py:class:`~datafusion.expr.Expr`.""" from __future__ import annotations @@ -263,12 +263,12 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: def encode(input: Expr, encoding: Expr) -> Expr: - """Encode the `input`, using the `encoding`. encoding can be base64 or hex.""" + """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" return Expr(f.encode(input.expr, encoding.expr)) def decode(input: Expr, encoding: Expr) -> Expr: - """Decode the `input`, using the `encoding`. encoding can be base64 or hex.""" + """Decode the ``input``, using the ``encoding``. 
encoding can be base64 or hex.""" return Expr(f.decode(input.expr, encoding.expr)) @@ -302,7 +302,7 @@ def list_join(expr: Expr, delimiter: Expr) -> Expr: def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr: - """Returns whether the argument is contained within the list `values`.""" + """Returns whether the argument is contained within the list ``values``.""" values = [v.expr for v in values] return Expr(f.in_list(arg.expr, values, negated)) @@ -326,9 +326,9 @@ def concat(*args: Expr) -> Expr: def concat_ws(separator: str, *args: Expr) -> Expr: - """Concatenates the list `args` with the separator. + """Concatenates the list ``args`` with the separator. - `NULL` arugments are ignored. `separator` should not be `NULL`. + ``NULL`` arugments are ignored. ``separator`` should not be ``NULL``. """ args = [arg.expr for arg in args] return Expr(f.concat_ws(separator, args)) @@ -355,9 +355,11 @@ def count_star() -> Expr: def case(expr: Expr) -> CaseBuilder: - """Create a ``CaseBuilder`` to match cases for the expression ``expr``. + """Create a case expression. - See ``datafusion.expr.CaseBuilder`` for detailed usage of ``CaseBuilder``. + Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + detailed usage. 
""" return CaseBuilder(f.case(expr.expr)) @@ -462,12 +464,12 @@ def character_length(arg: Expr) -> Expr: def length(string: Expr) -> Expr: - """The number of characters in the `string`.""" + """The number of characters in the ``string``.""" return Expr(f.length(string.expr)) def char_length(string: Expr) -> Expr: - """The number of characters in the `string`.""" + """The number of characters in the ``string``.""" return Expr(f.char_length(string.expr)) @@ -477,7 +479,7 @@ def chr(arg: Expr) -> Expr: def coalesce(*args: Expr) -> Expr: - """Returns the value of the first expr in `args` which is not NULL.""" + """Returns the value of the first expr in ``args`` which is not NULL.""" args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) @@ -503,7 +505,7 @@ def degrees(arg: Expr) -> Expr: def ends_with(arg: Expr, suffix: Expr) -> Expr: - """Returns true if the `string` ends with the `suffix`, false otherwise.""" + """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" return Expr(f.ends_with(arg.expr, suffix.expr)) @@ -521,9 +523,9 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr: """Find a string in a list of strings. Returns a value in the range of 1 to N if the string is in the string list - `string_list` consisting of N substrings. + ``string_list`` consisting of N substrings. - The string list is a string composed of substrings separated by `,` characters. + The string list is a string composed of substrings separated by ``,`` characters. """ return Expr(f.find_in_set(string.expr, string_list.expr)) @@ -541,16 +543,16 @@ def gcd(x: Expr, y: Expr) -> Expr: def initcap(string: Expr) -> Expr: """Set the initial letter of each word to capital. - Converts the first letter of each word in `string` to uppercase and the remaining + Converts the first letter of each word in ``string`` to uppercase and the remaining characters to lowercase. 
""" return Expr(f.initcap(string.expr)) def instr(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`. + """Finds the position from where the ``substring`` matches the ``string``. - This is an alias for :func:`strpos`. + This is an alias for :py:func:`strpos`. """ return strpos(string, substring) @@ -566,7 +568,7 @@ def lcm(x: Expr, y: Expr) -> Expr: def left(string: Expr, n: Expr) -> Expr: - """Returns the first `n` characters in the `string`.""" + """Returns the first ``n`` characters in the ``string``.""" return Expr(f.left(string.expr, n.expr)) @@ -581,7 +583,7 @@ def ln(arg: Expr) -> Expr: def log(base: Expr, num: Expr) -> Expr: - """Returns the logarithm of a number for a particular `base`.""" + """Returns the logarithm of a number for a particular ``base``.""" return Expr(f.log(base.expr, num.expr)) @@ -622,7 +624,7 @@ def md5(arg: Expr) -> Expr: def nanvl(x: Expr, y: Expr) -> Expr: - """Returns `x` if `x` is not `NaN`. Otherwise returns `y`.""" + """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``.""" return Expr(f.nanvl(x.expr, y.expr)) @@ -636,8 +638,8 @@ def overlay( ) -> Expr: """Replace a substring with a new substring. - Replace the substring of string that starts at the `start`'th character and - extends for `length` characters with new substring. + Replace the substring of string that starts at the ``start``'th character and + extends for ``length`` characters with new substring. """ if length is None: return Expr(f.overlay(string.expr, substring.expr, start.expr)) @@ -650,7 +652,7 @@ def pi() -> Expr: def position(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`. + """Finds the position from where the ``substring`` matches the ``string``. This is an alias for :func:`strpos`. 
""" @@ -658,14 +660,14 @@ def position(string: Expr, substring: Expr) -> Expr: def power(base: Expr, exponent: Expr) -> Expr: - """Returns `base` raised to the power of `exponent`.""" + """Returns ``base`` raised to the power of ``exponent``.""" return Expr(f.power(base.expr, exponent.expr)) def pow(base: Expr, exponent: Expr) -> Expr: - """Returns `base` raised to the power of `exponent`. + """Returns ``base`` raised to the power of ``exponent``. - This is an alias of `power`. + This is an alias of :py:func:`power`. """ return power(base, exponent) @@ -690,7 +692,7 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: """Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the - corresponding index in `regex` to string in `string`. + corresponding index in ``regex`` to string in ``string``. """ if flags is not None: flags = flags.expr @@ -714,12 +716,12 @@ def regexp_replace( def repeat(string: Expr, n: Expr) -> Expr: - """Repeats the `string` to `n` times.""" + """Repeats the ``string`` to ``n`` times.""" return Expr(f.repeat(string.expr, n.expr)) def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of `from` with `to` in the `string`.""" + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) @@ -729,7 +731,7 @@ def reverse(arg: Expr) -> Expr: def right(string: Expr, n: Expr) -> Expr: - """Returns the last `n` characters in the `string`.""" + """Returns the last ``n`` characters in the ``string``.""" return Expr(f.right(string.expr, n.expr)) @@ -738,7 +740,7 @@ def round(value: Expr, decimal_places: Expr = Expr.literal(0)) -> Expr: If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. 
For example - `round(lit(125.2345), lit(-2))` would yield a value of `100.0`. + ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. """ return Expr(f.round(value.expr, decimal_places.expr)) @@ -813,22 +815,26 @@ def starts_with(string: Expr, prefix: Expr) -> Expr: def strpos(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the `substring` matches the `string`.""" + """Finds the position from where the ``substring`` matches the ``string``.""" return Expr(f.strpos(string.expr, substring.expr)) def substr(string: Expr, position: Expr) -> Expr: - """Substring from the `position` to the end.""" + """Substring from the ``position`` to the end.""" return Expr(f.substr(string.expr, position.expr)) def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: - """Returns the substring from `string` before `count` occurrences of `delimiter`.""" + """Returns an indexed substring. + + The return will be the ``string`` from before ``count`` occurrences of + ``delimiter``. + """ return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) def substring(string: Expr, position: Expr, length: Expr) -> Expr: - """Substring from the `position` with `length` characters.""" + """Substring from the ``position`` with ``length`` characters.""" return Expr(f.substring(string.expr, position.expr, length.expr)) @@ -856,7 +862,7 @@ def now() -> Expr: def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in nanoseconds. + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. For usage of ``formatters`` see the rust chrono package ``strftime`` package. @@ -870,33 +876,33 @@ def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in milliseconds. + """Converts a string and optional formats to a ``Timestamp`` in milliseconds. 
- See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_millis(arg.expr, *formatters)) def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in microseconds. + """Converts a string and optional formats to a ``Timestamp`` in microseconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_micros(arg.expr, *formatters)) def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in nanoseconds. + """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: - """Converts a string and optional formats to a `Timestamp` in seconds. + """Converts a string and optional formats to a ``Timestamp`` in seconds. - See `to_timestamp` for a description on how to use formatters. + See :py:func:`to_timestamp` for a description on how to use formatters. """ return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) @@ -920,7 +926,7 @@ def current_time() -> Expr: def datepart(part: Expr, date: Expr) -> Expr: """Return a specified part of a date. - This is an alias for `date_part`. + This is an alias for :py:func:`date_part`. """ return date_part(part, date) @@ -938,7 +944,7 @@ def date_trunc(part: Expr, date: Expr) -> Expr: def datetrunc(part: Expr, date: Expr) -> Expr: """Truncates the date to a specified level of precision. - This is an alias for `date_trunc`. + This is an alias for :py:func:`date_trunc`. 
""" return date_trunc(part, date) @@ -954,7 +960,7 @@ def make_date(year: Expr, month: Expr, day: Expr) -> Expr: def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the characters in `from_val` with the counterpart in `to_val`.""" + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) @@ -984,7 +990,7 @@ def make_array(*args: Expr) -> Expr: def array(*args: Expr) -> Expr: """Returns an array using the specified input expressions. - This is an alias for `make_array`. + This is an alias for :py:func:`make_array`. """ return make_array(args) @@ -1025,7 +1031,7 @@ def arrow_typeof(arg: Expr) -> Expr: def random() -> Expr: - """Returns a random value in the range `0.0 <= x < 1.0`.""" + """Returns a random value in the range ``0.0 <= x < 1.0``.""" return Expr(f.random()) @@ -1037,7 +1043,7 @@ def array_append(array: Expr, element: Expr) -> Expr: def array_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1045,7 +1051,7 @@ def array_push_back(array: Expr, element: Expr) -> Expr: def list_append(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1053,7 +1059,7 @@ def list_append(array: Expr, element: Expr) -> Expr: def list_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. - This is an alias for `array_append`. + This is an alias for :py:func:`array_append`. """ return array_append(array, element) @@ -1067,7 +1073,7 @@ def array_concat(*args: Expr) -> Expr: def array_cat(*args: Expr) -> Expr: """Concatenates the input arrays. - This is an alias for `array_concat`. 
+ This is an alias for :py:func:`array_concat`. """ return array_concat(*args) @@ -1085,7 +1091,7 @@ def array_distinct(array: Expr) -> Expr: def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. - This is an alias for `array_distinct`. + This is an alias for :py:func:`array_distinct`. """ return array_distinct(array) @@ -1093,7 +1099,7 @@ def list_distinct(array: Expr) -> Expr: def list_dims(array: Expr) -> Expr: """Returns an array of the array's dimensions. - This is an alias for `array_dims`. + This is an alias for :py:func:`array_dims`. """ return array_dims(array) @@ -1106,7 +1112,7 @@ def array_element(array: Expr, n: Expr) -> Expr: def array_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1114,7 +1120,7 @@ def array_extract(array: Expr, n: Expr) -> Expr: def list_element(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1122,7 +1128,7 @@ def list_element(array: Expr, n: Expr) -> Expr: def list_extract(array: Expr, n: Expr) -> Expr: """Extracts the element with the index n from the array. - This is an alias for `array_element`. + This is an alias for :py:func:`array_element`. """ return array_element(array, n) @@ -1135,7 +1141,7 @@ def array_length(array: Expr) -> Expr: def list_length(array: Expr) -> Expr: """Returns the length of the array. - This is an alias for `array_length`. + This is an alias for :py:func:`array_length`. 
""" return array_length(array) @@ -1171,7 +1177,7 @@ def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1179,7 +1185,7 @@ def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1187,7 +1193,7 @@ def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. - This is an alias for `array_position`. + This is an alias for :py:func:`array_position`. """ return array_position(array, element, index) @@ -1200,7 +1206,7 @@ def array_positions(array: Expr, element: Expr) -> Expr: def list_positions(array: Expr, element: Expr) -> Expr: """Searches for an element in the array and returns all occurrences. - This is an alias for `array_positions`. + This is an alias for :py:func:`array_positions`. """ return array_positions(array, element) @@ -1213,7 +1219,7 @@ def array_ndims(array: Expr) -> Expr: def list_ndims(array: Expr) -> Expr: """Returns the number of dimensions of the array. - This is an alias for `array_ndims`. + This is an alias for :py:func:`array_ndims`. 
""" return array_ndims(array) @@ -1226,7 +1232,7 @@ def array_prepend(element: Expr, array: Expr) -> Expr: def array_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1234,7 +1240,7 @@ def array_push_front(element: Expr, array: Expr) -> Expr: def list_prepend(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1242,7 +1248,7 @@ def list_prepend(element: Expr, array: Expr) -> Expr: def list_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. - This is an alias for `array_prepend`. + This is an alias for :py:func:`array_prepend`. """ return array_prepend(element, array) @@ -1265,20 +1271,20 @@ def array_remove(array: Expr, element: Expr) -> Expr: def list_remove(array: Expr, element: Expr) -> Expr: """Removes the first element from the array equal to the given value. - This is an alias for `array_remove`. + This is an alias for :py:func:`array_remove`. """ return array_remove(array, element) def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first `max` elements from the array equal to the given value.""" + """Removes the first ``max`` elements from the array equal to the given value.""" return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first `max` elements from the array equal to the given value. + """Removes the first ``max`` elements from the array equal to the given value. - This is an alias for `array_remove_n`. + This is an alias for :py:func:`array_remove_n`. 
""" return array_remove_n(array, element, max) @@ -1291,13 +1297,13 @@ def array_remove_all(array: Expr, element: Expr) -> Expr: def list_remove_all(array: Expr, element: Expr) -> Expr: """Removes all elements from the array equal to the given value. - This is an alias for `array_remove_all`. + This is an alias for :py:func:`array_remove_all`. """ return array_remove_all(array, element) def array_repeat(element: Expr, count: Expr) -> Expr: - """Returns an array containing `element` `count` times.""" + """Returns an array containing ``element`` ``count`` times.""" return Expr(f.array_repeat(element.expr, count.expr)) @@ -1309,27 +1315,27 @@ def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``. - This is an alias for `array_replace`. + This is an alias for :py:func:`array_replace`. """ return array_replace(array, from_val, to_val) def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: - """Replace `n` occurrences of ``from_val`` with ``to_val``. + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. - Replaces the first `max` occurrences of the specified element with another + Replaces the first ``max`` occurrences of the specified element with another specified element. """ return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr: - """Replace `n` occurrences of ``from_val`` with ``to_val``. + """Replace ``n`` occurrences of ``from_val`` with ``to_val``. - Replaces the first `max` occurrences of the specified element with another + Replaces the first ``max`` occurrences of the specified element with another specified element. - This is an alias for `array_replace_n`. + This is an alias for :py:func:`array_replace_n`. 
""" return array_replace_n(array, from_val, to_val, max) @@ -1342,7 +1348,7 @@ def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces all occurrences of ``from_val`` with ``to_val``. - This is an alias for `array_replace_all`. + This is an alias for :py:func:`array_replace_all`. """ return array_replace_all(array, from_val, to_val) @@ -1365,7 +1371,7 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) def list_sort(array: Expr, descending: bool = False, null_first: bool = False) -> Expr: - """This is an alias for ``array_sort``.""" + """This is an alias for :py:func:`array_sort`.""" return array_sort(array, descending=descending, null_first=null_first) @@ -1381,20 +1387,20 @@ def array_slice( def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) -> Expr: """Returns a slice of the array. - This is an alias for `array_slice`. + This is an alias for :py:func:`array_slice`. """ return array_slice(array, begin, end, stride) def array_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements in the intersection of array1 and array2.""" + """Returns the intersection of ``array1`` and ``array2``.""" return Expr(f.array_intersect(array1.expr, array2.expr)) def list_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements in the intersection of `array1` and `array2`. + """Returns an the intersection of ``array1`` and ``array2``. - This is an alias for `array_intersect`. + This is an alias for :py:func:`array_intersect`. """ return array_intersect(array1, array2) @@ -1412,20 +1418,20 @@ def list_union(array1: Expr, array2: Expr) -> Expr: Duplicate rows will not be returned. - This is an alias for `array_union`. + This is an alias for :py:func:`array_union`. 
""" return array_union(array1, array2) def array_except(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements that appear in `array1` but not in `array2`.""" + """Returns the elements that appear in ``array1`` but not in ``array2``.""" return Expr(f.array_except(array1.expr, array2.expr)) def list_except(array1: Expr, array2: Expr) -> Expr: - """Returns an array of the elements that appear in `array1` but not in the `array2`. + """Returns the elements that appear in ``array1`` but not in the ``array2``. - This is an alias for `array_except`. + This is an alias for :py:func:`array_except`. """ return array_except(array1, array2) @@ -1433,8 +1439,8 @@ def list_except(array1: Expr, array2: Expr) -> Expr: def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: """Returns an array with the specified size filled. - If `size` is greater than the `array` length, the additional entries will be filled - with the given `value`. + If ``size`` is greater than the ``array`` length, the additional entries will + be filled with the given ``value``. """ return Expr(f.array_resize(array.expr, size.expr, value.expr)) @@ -1442,8 +1448,8 @@ def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: """Returns an array with the specified size filled. - If `size` is greater than the `array` length, the additional entries will be - filled with the given `value`. This is an alias for `array_resize`. + If ``size`` is greater than the ``array`` length, the additional entries will be + filled with the given ``value``. This is an alias for :py:func:`array_resize`. """ return array_resize(array, size, value) @@ -1489,7 +1495,7 @@ def approx_percentile_cont_with_weight( ) -> Expr: """Returns the value of the approximate percentile. 
- This function is similar to ``approx_percentile_cont`` except that it uses + This function is similar to :py:func:`approx_percentile_cont` except that it uses the associated associated weights. """ return Expr( @@ -1510,7 +1516,7 @@ def avg(arg: Expr, distinct: bool = False) -> Expr: def corr(value1: Expr, value2: Expr, distinct: bool = False) -> Expr: - """Returns the correlation coefficient between `value1` and `value2`.""" + """Returns the correlation coefficient between ``value1`` and ``value2``.""" return Expr(f.corr(value1.expr, value2.expr, distinct=distinct)) @@ -1528,7 +1534,7 @@ def count(args: Expr | list[Expr] | None = None, distinct: bool = False) -> Expr def covar(y: Expr, x: Expr) -> Expr: """Computes the sample covariance. - This is an alias for `covar_samp`. + This is an alias for :py:func:`covar_samp`. """ return covar_samp(y, x) @@ -1559,7 +1565,7 @@ def max(arg: Expr, distinct: bool = False) -> Expr: def mean(arg: Expr, distinct: bool = False) -> Expr: """Returns the average (mean) value of the argument. - This is an alias for `avg`. + This is an alias for :py:func:`avg`. """ return avg(arg, distinct) @@ -1592,7 +1598,7 @@ def stddev_pop(arg: Expr, distinct: bool = False) -> Expr: def stddev_samp(arg: Expr, distinct: bool = False) -> Expr: """Computes the sample standard deviation of the argument. - This is an alias for `stddev`. + This is an alias for :py:func:`stddev`. """ return stddev(arg, distinct) @@ -1600,7 +1606,7 @@ def stddev_samp(arg: Expr, distinct: bool = False) -> Expr: def var(arg: Expr) -> Expr: """Computes the sample variance of the argument. - This is an alias for `var_samp`. + This is an alias for :py:func:`var_samp`. """ return var_samp(arg) @@ -1616,7 +1622,7 @@ def var_samp(arg: Expr) -> Expr: def regr_avgx(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the average of the independent variable `x`. + """Computes the average of the independent variable ``x``. 
Only non-null pairs of the inputs are evaluated. """ @@ -1652,7 +1658,7 @@ def regr_slope(y: Expr, x: Expr, distinct: bool = False) -> Expr: def regr_sxx(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the independent variable `x`.""" + """Computes the sum of squares of the independent variable ``x``.""" return Expr(f.regr_sxx(y.expr, x.expr, distinct)) @@ -1662,7 +1668,7 @@ def regr_sxy(y: Expr, x: Expr, distinct: bool = False) -> Expr: def regr_syy(y: Expr, x: Expr, distinct: bool = False) -> Expr: - """Computes the sum of squares of the dependent variable `y`.""" + """Computes the sum of squares of the dependent variable ``y``.""" return Expr(f.regr_syy(y.expr, x.expr, distinct)) diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index dcfd5548..44936f7d 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -17,7 +17,8 @@ """This module provides the classes for handling record batches. -These are typically the result of dataframe `execute_stream` operations. +These are typically the result of dataframe +:py:func:`datafusion.dataframe.execute_stream` operations. """ from __future__ import annotations @@ -31,24 +32,25 @@ class RecordBatch: - """This class is essentially a wrapper for ``pyarrow.RecordBatch``.""" + """This class is essentially a wrapper for :py:class:`pyarrow.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. - See the ``RecordBatchStream`` iterator for generating this class. + See the :py:class:`RecordBatchStream` iterator for generating this class. """ self.record_batch = record_batch def to_pyarrow(self) -> pyarrow.RecordBatch: - """Convert to pyarrow ``RecordBatch``.""" + """Convert to :py:class:`pyarrow.RecordBatch`.""" return self.record_batch.to_pyarrow() class RecordBatchStream: """This class represents a stream of record batches. 
-    These are typically the result of a ``DataFrame::execute_stream`` operation.
+    These are typically the result of a
+    :py:func:`~datafusion.dataframe.DataFrame.execute_stream` operation.
     """
 
     def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None:
@@ -56,7 +58,7 @@ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None:
         self.rbs = record_batch_stream
 
     def next(self) -> RecordBatch | None:
-        """See ``__next__`` for the iterator function."""
+        """See :py:func:`__next__` for the iterator function."""
        try:
             next_batch = next(self)
         except StopIteration:
diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py
index 4b44ad19..0cdd19a5 100644
--- a/python/datafusion/substrait.py
+++ b/python/datafusion/substrait.py
@@ -48,7 +48,8 @@ def __init__(self, plan: substrait_internal.Plan) -> None:
         """Create a substrait plan.
 
         The user should not have to call this constructor directly. Rather, it
-        should be created via `Serde` or `Producer` classes in this module.
+        should be created via :py:class:`Serde` or :py:class:`Producer` classes
+        in this module.
         """
         self.plan_internal = plan
 
diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py
index 4bfbabe6..3161db1b 100644
--- a/python/datafusion/udf.py
+++ b/python/datafusion/udf.py
@@ -35,7 +35,7 @@ class Volatility(Enum):
     """Defines how stable or volatile a function is.
 
     When setting the volatility of a function, you can either pass this
-    enumeration or a `str`. The `str` equivalent is the lower case value of the
+    enumeration or a ``str``. The ``str`` equivalent is the lower case value of the
     name (`"immutable"`, `"stable"`, or `"volatile"`).
     """
 
@@ -52,9 +52,9 @@ class Volatility(Enum):
 
     A stable function may return different values given the same input across
     different queries but must return the same value for a given input within a
-    query. An example of this is the `Now` function. 
DataFusion will attempt to - inline `Stable` functions during planning, when possible. For query - `select col1, now() from t1`, it might take a while to execute but `now()` + query. An example of this is the ``Now`` function. DataFusion will attempt to + inline ``Stable`` functions during planning, when possible. For query + ``select col1, now() from t1``, it might take a while to execute but ``now()`` column will be the same for each output row, which is evaluated during planning. """ @@ -66,7 +66,7 @@ class Volatility(Enum): Multiple invocations of a volatile function may return different results when used in the same query. An example of this is the random() function. DataFusion can not evaluate such functions during planning. In the query - `select col1, random() from t1`, `random()` function will be evaluated + ``select col1, random() from t1``, ``random()`` function will be evaluated for each output row, resulting in a unique random value for each row. """ @@ -78,7 +78,7 @@ def __str__(self): class ScalarUDF: """Class for performing scalar user defined functions (UDF). - Scalar UDFs operate on a row by row basis. See also ``AggregateUDF`` for + Scalar UDFs operate on a row by row basis. See also :py:class:`AggregateUDF` for operating on a group of rows. """ @@ -92,7 +92,7 @@ def __init__( ) -> None: """Instantiate a scalar user defined function (UDF). - See helper method ``udf`` for argument details. + See helper method :py:func:`udf` for argument details. """ self.udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) @@ -119,7 +119,7 @@ def udf( Args: func: A callable python function. - input_types: The data types of the arguments to `func`. This list + input_types: The data types of the arguments to ``func``. This list must be of the same length as the number of arguments. return_type: The data type of the return value from the python function. 
@@ -144,7 +144,7 @@ def udf(
 
 
 class Accumulator(metaclass=ABCMeta):
-    """Defines how an `AggregateUDF` accumulates values during an evaluation."""
+    """Defines how an :py:class:`AggregateUDF` accumulates values."""
 
     @abstractmethod
     def state(self) -> List[pyarrow.Scalar]:
@@ -175,7 +175,7 @@ class AggregateUDF:
     """Class for performing scalar user defined functions (UDF).
 
     Aggregate UDFs operate on a group of rows and return a single value. See
-    also ``ScalarUDF`` for operating on a row by row basis.
+    also :py:class:`ScalarUDF` for operating on a row by row basis.
     """
 
     def __init__(
@@ -189,7 +189,7 @@ def __init__(
     ) -> None:
         """Instantiate a user defined aggregate function (UDAF).
 
-        See ``Aggregate::udaf`` for a convenience function and arugment
+        See :py:func:`udaf` for a convenience function and argument
         descriptions.
         """
         self.udf = df_internal.AggregateUDF(
@@ -216,14 +216,14 @@ def udaf(
 ) -> AggregateUDF:
     """Create a new User Defined Aggregate Function.
 
-    The accumulator function must be callable and implement `Accumulator`.
+    The accumulator function must be callable and implement :py:class:`Accumulator`.
 
     Args:
         accum: The accumulator python function.
-        input_types: The data types of the arguments to `accum`.
+        input_types: The data types of the arguments to ``accum``.
         return_type: The data type of the return value.
         state_type: The data types of the intermediate accumulation.
-        volatility: See `Volatility` for allowed values.
+        volatility: See :py:class:`Volatility` for allowed values.
         name: A descriptive name for the function.
 
Returns: From 7ec414869f9d5ea498af1dde5cfc470c0e749c45 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 28 Jul 2024 11:24:31 -0400 Subject: [PATCH 2/9] Update documentation to cross reference --- docs/source/conf.py | 4 ++-- docs/source/user-guide/basics.rst | 12 ++++++------ .../user-guide/common-operations/aggregations.rst | 2 +- .../user-guide/common-operations/basic-info.rst | 8 ++++---- .../user-guide/common-operations/expressions.rst | 10 +++++----- .../user-guide/common-operations/functions.rst | 12 ++++++------ docs/source/user-guide/common-operations/joins.rst | 4 ++-- .../common-operations/select-and-filter.rst | 10 +++++----- .../user-guide/common-operations/udf-and-udfa.rst | 4 ++-- docs/source/user-guide/common-operations/windows.rst | 6 +++--- docs/source/user-guide/configuration.rst | 6 +++--- docs/source/user-guide/io/avro.rst | 2 +- docs/source/user-guide/io/csv.rst | 6 +++--- docs/source/user-guide/io/json.rst | 2 +- docs/source/user-guide/io/parquet.rst | 6 +++--- python/datafusion/__init__.py | 11 +++++++++-- python/datafusion/dataframe.py | 2 +- python/datafusion/functions.py | 8 ++++---- python/datafusion/udf.py | 8 ++++---- 19 files changed, 65 insertions(+), 58 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3a66aaf5..122352ec 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,8 +46,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", + # "sphinx.ext.autodoc", + # "sphinx.ext.autosummary", "sphinx.ext.doctest", "sphinx.ext.ifconfig", "sphinx.ext.mathjax", diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 4ac095e4..3c97d1ef 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -54,7 +54,7 @@ The first statement group: # create a context ctx = datafusion.SessionContext() -creates a :code:`SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state +creates a :py:class:`~datafusion.context.SessionContext`, that is, the main interface for executing queries with DataFusion. It maintains the state of the connection between a user and an instance of the DataFusion engine. Additionally it provides the following functionality: - Create a DataFrame from a CSV or Parquet data source. @@ -74,9 +74,9 @@ The second statement group creates a :code:`DataFrame`, df = ctx.create_dataframe([[batch]]) A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_. -DataFrames are typically created by calling a method on :code:`SessionContext`, such as :code:`read_csv`, and can then be modified by -calling the transformation methods, such as :meth:`.DataFrame.filter`, :meth:`.DataFrame.select`, :meth:`.DataFrame.aggregate`, -and :meth:`.DataFrame.limit` to build up a query definition. +DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by +calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, +and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. 
The third statement uses :code:`Expressions` to build up a query definition. @@ -87,5 +87,5 @@ The third statement uses :code:`Expressions` to build up a query definition. col("a") - col("b"), ) -Finally the :code:`collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it, -collecting all results into a list of `RecordBatch `_. \ No newline at end of file +Finally the :py:func:`~datafusion.dataframe.DataFrame.collect` method converts the logical plan represented by the DataFrame into a physical plan and execute it, +collecting all results into a list of `RecordBatch `_. diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst index 235d644e..b9202129 100644 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ b/docs/source/user-guide/common-operations/aggregations.rst @@ -19,7 +19,7 @@ Aggregation ============ An aggregate or aggregation is a function where the values of multiple rows are processed together to form a single summary value. -For performing an aggregation, DataFusion provides the :meth:`.DataFrame.aggregate` +For performing an aggregation, DataFusion provides the :py:func:`~datafusion.dataframe.DataFrame.aggregate` .. ipython:: python diff --git a/docs/source/user-guide/common-operations/basic-info.rst b/docs/source/user-guide/common-operations/basic-info.rst index 424e1cc9..d48b49d5 100644 --- a/docs/source/user-guide/common-operations/basic-info.rst +++ b/docs/source/user-guide/common-operations/basic-info.rst @@ -34,26 +34,26 @@ In this section, you will learn how to display essential details of DataFrames u }) df -Use :meth:`.DataFrame.limit` to view the top rows of the frame: +Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame: .. 
ipython:: python df.limit(2) -Display the columns of the DataFrame using :meth:`.DataFrame.schema`: +Display the columns of the DataFrame using :py:func:`~datafusion.dataframe.DataFrame.schema`: .. ipython:: python df.schema() -The method :meth:`.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, +The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, passing them to an Arrow table, and then converting them to a pandas DataFrame. .. ipython:: python df.to_pandas() -:meth:`.DataFrame.describe` shows a quick statistic summary of your data: +:py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data: .. ipython:: python diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst index 447a90bb..c8f8b8f2 100644 --- a/docs/source/user-guide/common-operations/expressions.rst +++ b/docs/source/user-guide/common-operations/expressions.rst @@ -28,16 +28,16 @@ concept shared across most compilers and databases. Column ------ -The first expression most new users will interact with is the Column, which is created by calling :func:`col`. -This expression represents a column within a DataFrame. The function :func:`col` takes as in input a string +The first expression most new users will interact with is the Column, which is created by calling :py:func:`~datafusion.col`. +This expression represents a column within a DataFrame. The function :py:func:`~datafusion.col` takes as in input a string and returns an expression as it's output. Literal ------- Literal expressions represent a single value. These are helpful in a wide range of operations where -a specific, known value is of interest. You can create a literal expression using the function :func:`lit`. 
-The type of the object passed to the :func:`lit` function will be used to convert it to a known data type. +a specific, known value is of interest. You can create a literal expression using the function :py:func:`~datafusion.lit`. +The type of the object passed to the :py:func:`~datafusion.lit` function will be used to convert it to a known data type. In the following example we create expressions for the column named `color` and the literal scalar string `red`. The resultant variable `red_units` is itself also an expression. @@ -64,7 +64,7 @@ Functions --------- As mentioned before, most functions in DataFusion return an expression at their output. This allows us to create -a wide variety of expressions built up from other expressions. For example, :func:`.alias` is a function that takes +a wide variety of expressions built up from other expressions. For example, :py:func:`~datafusion.expr.Expr.alias` is a function that takes as it input a single expression and returns an expression in which the name of the expression has changed. The following example shows a series of expressions that are built up from functions operating on expressions. diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst index 7efb939e..a0b95c90 100644 --- a/docs/source/user-guide/common-operations/functions.rst +++ b/docs/source/user-guide/common-operations/functions.rst @@ -40,7 +40,7 @@ We'll use the pokemon dataset in the following examples. Mathematical ------------ -DataFusion offers mathematical functions such as :func:`.pow` or :func:`.log` +DataFusion offers mathematical functions such as :py:func:`~datafusion.functions.pow` or :py:func:`~datafusion.functions.log` .. 
ipython:: python @@ -55,7 +55,7 @@ DataFusion offers mathematical functions such as :func:`.pow` or :func:`.log` Conditional ----------- -There 3 conditional functions in DataFusion :func:`.coalesce`, :func:`.nullif` and :func:`.case` (not available in Python) +There 3 conditional functions in DataFusion :py:func:`~datafusion.functions.coalesce`, :py:func:`~datafusion.functions.nullif` and :py:func:`~datafusion.functions.case`. .. ipython:: python @@ -66,13 +66,13 @@ There 3 conditional functions in DataFusion :func:`.coalesce`, :func:`.nullif` a Temporal -------- -For selecting the current time use :func:`.now` +For selecting the current time use :py:func:`~datafusion.functions.now` .. ipython:: python df.select(f.now()) -Convert to timestamps using :func:`.to_timestamp` +Convert to timestamps using :py:func:`~datafusion.functions.to_timestamp` .. ipython:: python @@ -92,7 +92,7 @@ DataFusion offers a range of helpful options. f.left(col('"Name"'), literal(4)).alias("code") ) -This also includes the functions for regular expressions like :func:`.regexp_replace` and :func:`.regexp_match` +This also includes the functions for regular expressions like :py:func:`~datafusion.functions.regexp_replace` and :py:func:`~datafusion.functions.regexp_match` .. ipython:: python @@ -105,7 +105,7 @@ This also includes the functions for regular expressions like :func:`.regexp_rep Other ----- -The function :func:`.in_list` allows to check a column for the presence of multiple values: +The function :py:func:`~datafusion.functions.in_list` allows to check a column for the presence of multiple values: .. 
ipython:: python diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst index 12820311..09fa145a 100644 --- a/docs/source/user-guide/common-operations/joins.rst +++ b/docs/source/user-guide/common-operations/joins.rst @@ -18,7 +18,7 @@ Joins ===== -DataFusion supports the following join variants via the method :meth:`.DataFrame.join` +DataFusion supports the following join variants via the method :py:func:`~datafusion.dataframe.DataFrame.join` - Inner Join - Left Join @@ -58,7 +58,7 @@ will be included in the resulting DataFrame. left.join(right, join_keys=(["customer_id"], ["id"]), how="inner") -The parameter :code:`join_keys` specifies the columns from the left DataFrame and right DataFrame that contains the values +The parameter ``join_keys`` specifies the columns from the left DataFrame and right DataFrame that contains the values that should match. Left Join diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst index 8ede230e..92b4841b 100644 --- a/docs/source/user-guide/common-operations/select-and-filter.rst +++ b/docs/source/user-guide/common-operations/select-and-filter.rst @@ -18,7 +18,7 @@ Column Selections ================= -Use :meth:`.DataFrame.select_columns` for basic column selection. +Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection. 
DataFusion can work with several file types, to start simple we can use a subset of the `TLC Trip Record Data `_ @@ -35,8 +35,8 @@ DataFusion can work with several file types, to start simple we can use a subset df = ctx.read_parquet("yellow_trip_data.parquet") df.select_columns("trip_distance", "passenger_count") -For mathematical or logical operations use :func:`.col` to select columns, and give meaningful names to the resulting -operations using :func:`.alias` +For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting +operations using :py:func:`~datafusion.expr.Expr.alias` .. ipython:: python @@ -48,7 +48,7 @@ operations using :func:`.alias` Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters (ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple - column selection use :meth:`.DataFrame.select_columns` without double quotes + column selection use :py:func:`~datafusion.dataframe.DataFrame.select_columns` without double quotes For selecting columns with capital letters use ``'"VendorID"'`` @@ -57,7 +57,7 @@ For selecting columns with capital letters use ``'"VendorID"'`` df.select(col('"VendorID"')) -To combine it with literal values use the :func:`.lit` +To combine it with literal values use the :py:func:`~datafusion.lit` .. ipython:: python diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index 62d249c7..54c68579 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -19,7 +19,7 @@ User Defined Functions ====================== DataFusion provides powerful expressions and functions, reducing the need for custom Python functions. -However you can still incorporate your own functions, i.e. 
User-Defined Functions (UDFs), with the :func:`.udf` function. +However, you can still incorporate your own functions, i.e. User-Defined Functions (UDFs), with the :py:func:`~datafusion.udf.ScalarUDF.udf` function. .. ipython:: python @@ -42,7 +42,7 @@ However you can still incorporate your own functions, i.e. User-Defined Function df.select(is_null_arr(col("a"))).to_pandas() -Additionally the :func:`.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs) +Additionally, the :py:func:`~datafusion.udf.AggregateUDF.udaf` function allows you to define User-Defined Aggregate Functions (UDAFs) .. code-block:: python diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst index f884c7e0..5ef3c986 100644 --- a/docs/source/user-guide/common-operations/windows.rst +++ b/docs/source/user-guide/common-operations/windows.rst @@ -21,7 +21,7 @@ Window Functions In this section you will learn about window functions. A window function utilizes values from one or multiple rows to produce a result for each individual row, unlike an aggregate function that provides a single value for multiple rows. -The functionality of window functions in DataFusion is supported by the dedicated :func:`.window` function. +The functionality of window functions in DataFusion is supported by the dedicated :py:func:`~datafusion.functions.window` function. We'll use the pokemon dataset (from Ritchie Vink) in the following examples. @@ -40,7 +40,7 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples. ctx = SessionContext() df = ctx.read_csv("pokemon.csv") -Here is an example that shows how to compare each pokemons’s attack power with the average attack power in its :code:`"Type 1"` +Here is an example that shows how to compare each pokemon’s attack power with the average attack power in its ``"Type 1"`` ..
ipython:: python @@ -54,7 +54,7 @@ Here is an example that shows how to compare each pokemons’s attack power with ) You can also control the order in which rows are processed by window functions by providing -a list of :func:`.order_by` functions for the :code:`order_by` parameter. +a list of ``order_by`` functions for the ``order_by`` parameter. .. ipython:: python diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst index 9c506b7e..7d330019 100644 --- a/docs/source/user-guide/configuration.rst +++ b/docs/source/user-guide/configuration.rst @@ -18,8 +18,8 @@ Configuration ============= -Let's look at how we can configure DataFusion. When creating a :code:`SessionContext`, you can pass in -a :code:`SessionConfig` and :code:`RuntimeConfig` object. These two cover a wide range of options. +Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in +a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeConfig` object. These two cover a wide range of options. .. code-block:: python @@ -47,5 +47,5 @@ a :code:`SessionConfig` and :code:`RuntimeConfig` object. These two cover a wide print(ctx) -You can read more about available :code:`SessionConfig` options in the `rust DataFusion Configuration guide `_, +You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide `_, and about :code:`RuntimeConfig` options in the rust `online API documentation `_. diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst index 85d546e2..5f1ff728 100644 --- a/docs/source/user-guide/io/avro.rst +++ b/docs/source/user-guide/io/avro.rst @@ -19,7 +19,7 @@ Avro ==== `Avro `_ is a serialization format for record data. 
Reading an avro file is very straightforward -with :meth:`.SessionContext.read_avro` +with :py:func:`~datafusion.context.SessionContext.read_avro` .. code-block:: python diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst index 3f95c54a..d2a62bfe 100644 --- a/docs/source/user-guide/io/csv.rst +++ b/docs/source/user-guide/io/csv.rst @@ -18,7 +18,7 @@ CSV === -Reading a csv is very straightforward with :meth:`.SessionContext.read_csv` +Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv` .. code-block:: python @@ -28,9 +28,9 @@ Reading a csv is very straightforward with :meth:`.SessionContext.read_csv` ctx = SessionContext() df = ctx.read_csv("file.csv") -An alternative is to use :meth:`.SessionContext.register_csv` +An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv` .. code-block:: python ctx.register_csv("file", "file.csv") - df = ctx.table("file") \ No newline at end of file + df = ctx.table("file") diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst index 1ee065c4..f9da3755 100644 --- a/docs/source/user-guide/io/json.rst +++ b/docs/source/user-guide/io/json.rst @@ -18,7 +18,7 @@ JSON ==== `JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format. -When it comes to reading a JSON file, using :meth:`.SessionContext.read_json` is a simple and easy +When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is simple and easy .. code-block:: python diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst index 78bba30c..75bc981c 100644 --- a/docs/source/user-guide/io/parquet.rst +++ b/docs/source/user-guide/io/parquet.rst @@ -18,7 +18,7 @@ Parquet ======= -It is quite simple to read a parquet file using the :meth:`.SessionContext.read_parquet` function.
+It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function. .. code-block:: python @@ -28,9 +28,9 @@ It is quite simple to read a parquet file using the :meth:`.SessionContext.read_ ctx = SessionContext() df = ctx.read_parquet("file.parquet") -An alternative is to use :meth:`.SessionContext.register_parquet` +An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet` .. code-block:: python ctx.register_parquet("file", "file.parquet") - df = ctx.table("file") \ No newline at end of file + df = ctx.table("file") diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 0569ac4b..08ca3fe0 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -69,7 +69,9 @@ "ScalarUDF", "WindowFrame", "column", + "col", "literal", + "lit", "DFSchema", "runtime", "Catalog", @@ -93,7 +95,9 @@ def column(value: str): return Expr.column(value) -col = column +def col(value: str): + """Create a column expression.""" + return Expr.column(value) def literal(value): @@ -101,7 +105,10 @@ def literal(value): return Expr.literal(value) -lit = literal +def lit(value): + """Create a literal expression.""" + return Expr.literal(value) + udf = ScalarUDF.udf diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index f9c073b3..fa739844 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -506,7 +506,7 @@ def count(self) -> int: """ return self.df.count() - @deprecated("Use :func:`unnest_columns` instead.") + @deprecated("Use :py:func:`unnest_columns` instead.") def unnest_column(self, column: str, preserve_nulls: bool = True) -> DataFrame: """See :py:func:`unnest_columns`.""" return DataFrame(self.df.unnest_column(column, preserve_nulls=preserve_nulls)) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 3c9e01bc..82b5056d 100644 --- a/python/datafusion/functions.py +++ 
b/python/datafusion/functions.py @@ -280,7 +280,7 @@ def array_to_string(expr: Expr, delimiter: Expr) -> Expr: def array_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -288,7 +288,7 @@ def array_join(expr: Expr, delimiter: Expr) -> Expr: def list_to_string(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -296,7 +296,7 @@ def list_to_string(expr: Expr, delimiter: Expr) -> Expr: def list_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :func:`array_to_string`. + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -654,7 +654,7 @@ def pi() -> Expr: def position(string: Expr, substring: Expr) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. - This is an alias for :func:`strpos`. + This is an alias for :py:func:`strpos`. """ return strpos(string, substring) diff --git a/python/datafusion/udf.py b/python/datafusion/udf.py index 3161db1b..12563b3d 100644 --- a/python/datafusion/udf.py +++ b/python/datafusion/udf.py @@ -94,7 +94,7 @@ def __init__( See helper method :py:func:`udf` for argument details. """ - self.udf = df_internal.ScalarUDF( + self._udf = df_internal.ScalarUDF( name, func, input_types, return_type, str(volatility) ) @@ -105,7 +105,7 @@ def __call__(self, *args: Expr) -> Expr: occur during the evaluation of the dataframe. 
""" args = [arg.expr for arg in args] - return Expr(self.udf.__call__(*args)) + return Expr(self._udf.__call__(*args)) @staticmethod def udf( @@ -192,7 +192,7 @@ def __init__( See :py:func:`udaf` for a convenience function and arugment descriptions. """ - self.udf = df_internal.AggregateUDF( + self._udf = df_internal.AggregateUDF( name, accumulator, input_types, return_type, state_type, str(volatility) ) @@ -203,7 +203,7 @@ def __call__(self, *args: Expr) -> Expr: occur during the evaluation of the dataframe. """ args = [arg.expr for arg in args] - return Expr(self.udf.__call__(*args)) + return Expr(self._udf.__call__(*args)) @staticmethod def udaf( From dd5e111394dd86134b6f458af8da06eb8b66edca Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 28 Jul 2024 18:34:20 -0400 Subject: [PATCH 3/9] Correct class names and internal attr --- python/datafusion/context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 731ff530..922cc87a 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -810,11 +810,11 @@ def register_dataset(self, name: str, dataset: pyarrow.dataset.Dataset) -> None: def register_udf(self, udf: ScalarUDF) -> None: """Register a user-defined function (UDF) with the context.""" - self.ctx.register_udf(udf.udf) + self.ctx.register_udf(udf._udf) def register_udaf(self, udaf: AggregateUDF) -> None: """Register a user-defined aggregation function (UDAF) with the context.""" - self.ctx.register_udaf(udaf) + self.ctx.register_udaf(udaf._udaf) def catalog(self, name: str = "datafusion") -> Catalog: """Retrieve a catalog by name.""" From 9d7dda2a336d8ecefb2ec0b52cd8c8fe0cb3ab54 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 29 Jul 2024 06:31:47 -0400 Subject: [PATCH 4/9] Revert changes that will end up coming in via PR #782 --- python/datafusion/substrait.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git 
a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 0cdd19a5..ad52002f 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -81,7 +81,7 @@ def serialize(sql: str, ctx: SessionContext, path: str | pathlib.Path) -> None: ctx: SessionContext to use. path: Path to write the Substrait plan to. """ - return substrait_internal.Serde.serialize(sql, ctx.ctx, str(path)) + return substrait_internal.serde.serialize(sql, ctx.ctx, str(path)) @staticmethod def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: @@ -94,7 +94,7 @@ def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.Serde.serialize_to_plan(sql, ctx.ctx)) + return Plan(substrait_internal.serde.serialize_to_plan(sql, ctx.ctx)) @staticmethod def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: @@ -107,7 +107,7 @@ def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: Returns: Substrait plan as bytes. """ - return substrait_internal.Serde.serialize_bytes(sql, ctx.ctx) + return substrait_internal.serde.serialize_bytes(sql, ctx.ctx) @staticmethod def deserialize(path: str | pathlib.Path) -> Plan: @@ -119,7 +119,7 @@ def deserialize(path: str | pathlib.Path) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.Serde.deserialize(str(path))) + return Plan(substrait_internal.serde.deserialize(str(path))) @staticmethod def deserialize_bytes(proto_bytes: bytes) -> Plan: @@ -131,7 +131,7 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.Serde.deserialize_bytes(proto_bytes)) + return Plan(substrait_internal.serde.deserialize_bytes(proto_bytes)) @deprecated("Use `Serde` instead.") @@ -156,7 +156,7 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: Substrait plan. 
""" return Plan( - substrait_internal.Producer.to_substrait_plan(logical_plan, ctx.ctx) + substrait_internal.producer.to_substrait_plan(logical_plan, ctx.ctx) ) @@ -181,7 +181,7 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: Returns: LogicalPlan. """ - return substrait_internal.Consumer.from_substrait_plan( + return substrait_internal.consumer.from_substrait_plan( ctx.ctx, plan.plan_internal ) From 678b9c6f89415be54a266b20108b29818e199a18 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 29 Jul 2024 06:59:58 -0400 Subject: [PATCH 5/9] Add autoapi to requirements file --- docs/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 42bc4e51..f5cece78 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -22,4 +22,5 @@ maturin jinja2 ipython pandas -pickleshare \ No newline at end of file +pickleshare +sphinx-autoapi From 611c2d9c557b81699d5e33dde3e22d4fd3449010 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 29 Jul 2024 07:00:46 -0400 Subject: [PATCH 6/9] Add git ignore for files retrieved during local site building --- docs/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 docs/.gitignore diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..41e13534 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +pokemon.csv +yellow_trip_data.parquet From b2dcebc0dd93e427054d701a3dd54ce568e48db9 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 5 Aug 2024 07:24:38 -0400 Subject: [PATCH 7/9] Remove unused portions of doc config --- docs/source/conf.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 122352ec..d5084551 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -46,12 +46,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - # "sphinx.ext.autodoc", - # "sphinx.ext.autosummary", - "sphinx.ext.doctest", - "sphinx.ext.ifconfig", "sphinx.ext.mathjax", - "sphinx.ext.viewcode", "sphinx.ext.napoleon", "myst_parser", "IPython.sphinxext.ipython_directive", @@ -71,21 +66,10 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] -# Show members for classes in .. autosummary -autodoc_default_options = { - "members": None, - "undoc-members": None, - "show-inheritance": None, - "inherited-members": None, -} - -autosummary_generate = True - autoapi_dirs = ["../../python"] autoapi_ignore = ["*tests*"] autoapi_member_order = "groupwise" suppress_warnings = ["autoapi.python_import_resolution"] -autoapi_keep_files = True autoapi_python_class_content = "both" From 790cbb7cf31fd1fdd985fcec0dd5642d115ddcdd Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 5 Aug 2024 07:46:49 -0400 Subject: [PATCH 8/9] Reset substrait capitalization that was reverted during rebase --- python/datafusion/substrait.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index ad52002f..0cdd19a5 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -81,7 +81,7 @@ def serialize(sql: str, ctx: SessionContext, path: str | pathlib.Path) -> None: ctx: SessionContext to use. path: Path to write the Substrait plan to. """ - return substrait_internal.serde.serialize(sql, ctx.ctx, str(path)) + return substrait_internal.Serde.serialize(sql, ctx.ctx, str(path)) @staticmethod def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: @@ -94,7 +94,7 @@ def serialize_to_plan(sql: str, ctx: SessionContext) -> Plan: Returns: Substrait plan. 
""" - return Plan(substrait_internal.serde.serialize_to_plan(sql, ctx.ctx)) + return Plan(substrait_internal.Serde.serialize_to_plan(sql, ctx.ctx)) @staticmethod def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: @@ -107,7 +107,7 @@ def serialize_bytes(sql: str, ctx: SessionContext) -> bytes: Returns: Substrait plan as bytes. """ - return substrait_internal.serde.serialize_bytes(sql, ctx.ctx) + return substrait_internal.Serde.serialize_bytes(sql, ctx.ctx) @staticmethod def deserialize(path: str | pathlib.Path) -> Plan: @@ -119,7 +119,7 @@ def deserialize(path: str | pathlib.Path) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.serde.deserialize(str(path))) + return Plan(substrait_internal.Serde.deserialize(str(path))) @staticmethod def deserialize_bytes(proto_bytes: bytes) -> Plan: @@ -131,7 +131,7 @@ def deserialize_bytes(proto_bytes: bytes) -> Plan: Returns: Substrait plan. """ - return Plan(substrait_internal.serde.deserialize_bytes(proto_bytes)) + return Plan(substrait_internal.Serde.deserialize_bytes(proto_bytes)) @deprecated("Use `Serde` instead.") @@ -156,7 +156,7 @@ def to_substrait_plan(logical_plan: LogicalPlan, ctx: SessionContext) -> Plan: Substrait plan. """ return Plan( - substrait_internal.producer.to_substrait_plan(logical_plan, ctx.ctx) + substrait_internal.Producer.to_substrait_plan(logical_plan, ctx.ctx) ) @@ -181,7 +181,7 @@ def from_substrait_plan(ctx: SessionContext, plan: Plan) -> LogicalPlan: Returns: LogicalPlan. 
""" - return substrait_internal.consumer.from_substrait_plan( + return substrait_internal.Consumer.from_substrait_plan( ctx.ctx, plan.plan_internal ) From 82580545fef33c14441025d6c600eba1042dcb09 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 5 Aug 2024 16:14:42 -0400 Subject: [PATCH 9/9] Small example changes --- examples/python-udf-comparisons.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index e2d85674..5a6f548f 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -22,7 +22,7 @@ import time path = os.path.dirname(os.path.abspath(__file__)) -filepath = os.path.join(path, "../tpch/data/lineitem.parquet") +filepath = os.path.join(path, "./tpch/data/lineitem.parquet") # This example serves to demonstrate alternate approaches to answering the # question "return all of the rows that have a specific combination of these @@ -122,7 +122,7 @@ def is_of_interest_impl( is_of_interest = udf( is_of_interest_impl, - [pa.int32(), pa.int32(), pa.utf8()], + [pa.int64(), pa.int64(), pa.utf8()], pa.bool_(), "stable", ) @@ -170,7 +170,7 @@ def udf_using_pyarrow_compute_impl( udf_using_pyarrow_compute = udf( udf_using_pyarrow_compute_impl, - [pa.int32(), pa.int32(), pa.utf8()], + [pa.int64(), pa.int64(), pa.utf8()], pa.bool_(), "stable", )