From 9d474deb800ed95cf4fe2c2d2775c25515e88993 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 16 Jul 2024 12:55:46 -0700 Subject: [PATCH 1/8] disable query-planning when merlin.core is imported --- merlin/core/__init__.py | 31 ++++++++++++++++++++++++++++++- tests/__init__.py | 3 +++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index f35898e5d..38c822313 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 20222024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,3 +17,32 @@ from merlin.core import _version __version__ = _version.get_versions()["version"] + + +# If dask is installed, make sure query-planning is disabled +_DASK_QUERY_PLANNING_ENABLED = False +try: + import dask + + dask.config.set({"dataframe.query-planning": False}) +except ImportError: + pass +else: + import sys + + import dask.dataframe as dd + from packaging.version import parse + + if parse(dask.__version__) > parse("2024.6.0"): + _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED + else: + _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules + + +if _DASK_QUERY_PLANNING_ENABLED: + raise NotImplementedError( + "Merlin does not support the query-planning API in Dask " + "Dataframe yet. Please make sure query-planning is " + "disabled before dask.dataframe is imported. E.g.:\n" + "dask.config.set({'dataframe.query-planning': False})" + ) diff --git a/tests/__init__.py b/tests/__init__.py index e69de29bb..3e70093aa 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +from merlin.core import _DASK_QUERY_PLANNING_ENABLED + +assert _DASK_QUERY_PLANNING_ENABLED is False From 3c26f83575dc2a40109c347056e00e2ab2791109 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 16 Jul 2024 12:56:57 -0700 Subject: [PATCH 2/8] fix typo --- merlin/core/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index 38c822313..f6032befa 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 20222024, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From cdc7e6df02a338a4fdabe69fa2d3148f95365938 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 16 Jul 2024 13:37:53 -0700 Subject: [PATCH 3/8] remove tests/__init__.py change --- tests/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 3e70093aa..e69de29bb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +0,0 @@ -from merlin.core import _DASK_QUERY_PLANNING_ENABLED - -assert _DASK_QUERY_PLANNING_ENABLED is False From dfedab7dc0cf3424dd16175dd3fb0a085abd6d0f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 16 Jul 2024 21:25:43 -0700 Subject: [PATCH 4/8] disable string conversin --- merlin/core/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index f6032befa..c63cba880 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -19,12 +19,17 @@ __version__ = _version.get_versions()["version"] -# If dask is installed, make sure query-planning is disabled _DASK_QUERY_PLANNING_ENABLED = False try: + # Disable query-planning and string conversion import dask - dask.config.set({"dataframe.query-planning": False}) + dask.config.set( + { + "dataframe.query-planning": False, + "dataframe.convert-string": False, + } + ) except ImportError: pass else: From 884dc26ca9c3814d61a842065c6ac89fa5c86328 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 17 Jul 2024 07:19:17 -0700 Subject: [PATCH 5/8] move dask config settings/check to top level (only way to be sure we don't import dd anywere before disabling these settings) --- merlin/__init__.py | 48 +++++++++++++++++++++++++++++++++++++++++ merlin/core/__init__.py | 36 +------------------------------ 2 files changed, 49 insertions(+), 35 deletions(-) create mode 100644 merlin/__init__.py diff --git a/merlin/__init__.py b/merlin/__init__.py new file mode 100644 index 000000000..70b6ed8aa --- /dev/null +++ b/merlin/__init__.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +_DASK_QUERY_PLANNING_ENABLED = False +try: + # Disable query-planning and string conversion + import dask + + dask.config.set( + { + "dataframe.query-planning": False, + "dataframe.convert-string": False, + } + ) +except ImportError: + pass +else: + import sys + + import dask.dataframe as dd + from packaging.version import parse + + if parse(dask.__version__) > parse("2024.6.0"): + _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED + else: + _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules + + +if _DASK_QUERY_PLANNING_ENABLED: + raise NotImplementedError( + "Merlin does not support the query-planning API in Dask " + "Dataframe yet. Please make sure query-planning is " + "disabled before dask.dataframe is imported. E.g.:\n" + "dask.config.set({'dataframe.query-planning': False})" + ) diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index c63cba880..f35898e5d 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,37 +17,3 @@ from merlin.core import _version __version__ = _version.get_versions()["version"] - - -_DASK_QUERY_PLANNING_ENABLED = False -try: - # Disable query-planning and string conversion - import dask - - dask.config.set( - { - "dataframe.query-planning": False, - "dataframe.convert-string": False, - } - ) -except ImportError: - pass -else: - import sys - - import dask.dataframe as dd - from packaging.version import parse - - if parse(dask.__version__) > parse("2024.6.0"): - _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED - else: - _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules - - -if _DASK_QUERY_PLANNING_ENABLED: - raise NotImplementedError( - "Merlin does not support the query-planning API in Dask " - "Dataframe yet. Please make sure query-planning is " - "disabled before dask.dataframe is imported. E.g.:\n" - "dask.config.set({'dataframe.query-planning': False})" - ) From fec8038e1eb01738ef4927849110c9041fed45ae Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 17 Jul 2024 07:35:07 -0700 Subject: [PATCH 6/8] clarify error message --- merlin/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/merlin/__init__.py b/merlin/__init__.py index 70b6ed8aa..15a4a5ee5 100644 --- a/merlin/__init__.py +++ b/merlin/__init__.py @@ -34,8 +34,13 @@ from packaging.version import parse if parse(dask.__version__) > parse("2024.6.0"): + # For newer versions of dask, we can just check + # the official DASK_EXPR_ENABLED constant _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED else: + # For older versions of dask, we must assume query + # planning is enabled if dask_expr was imported + # (because we can't know for sure) _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules @@ -43,6 +48,8 @@ raise NotImplementedError( "Merlin does not support the query-planning API in Dask " "Dataframe yet. Please make sure query-planning is " - "disabled before dask.dataframe is imported. E.g.:\n" - "dask.config.set({'dataframe.query-planning': False})" + "disabled before dask.dataframe is imported.\n\n" + "e.g. dask.config.set({'dataframe.query-planning': False})" + "\n\nOr set the environment variable: " + "export DASK_DATAFRAME__QUERY_PLANNING=False" ) From 1bafba87c35bacfb3b4ded8d85feb76f9b69bd1e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 17 Jul 2024 08:24:50 -0700 Subject: [PATCH 7/8] use new configs module tfor central config validation point --- merlin/__init__.py | 55 ----------------------------------------- merlin/core/__init__.py | 4 ++- merlin/dag/__init__.py | 6 ++++- merlin/io/__init__.py | 8 ++++-- 4 files changed, 14 insertions(+), 59 deletions(-) delete mode 100644 merlin/__init__.py diff --git a/merlin/__init__.py b/merlin/__init__.py deleted file mode 100644 index 15a4a5ee5..000000000 --- a/merlin/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (c) 2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -_DASK_QUERY_PLANNING_ENABLED = False -try: - # Disable query-planning and string conversion - import dask - - dask.config.set( - { - "dataframe.query-planning": False, - "dataframe.convert-string": False, - } - ) -except ImportError: - pass -else: - import sys - - import dask.dataframe as dd - from packaging.version import parse - - if parse(dask.__version__) > parse("2024.6.0"): - # For newer versions of dask, we can just check - # the official DASK_EXPR_ENABLED constant - _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED - else: - # For older versions of dask, we must assume query - # planning is enabled if dask_expr was imported - # (because we can't know for sure) - _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules - - -if _DASK_QUERY_PLANNING_ENABLED: - raise NotImplementedError( - "Merlin does not support the query-planning API in Dask " - "Dataframe yet. Please make sure query-planning is " - "disabled before dask.dataframe is imported.\n\n" - "e.g. dask.config.set({'dataframe.query-planning': False})" - "\n\nOr set the environment variable: " - "export DASK_DATAFRAME__QUERY_PLANNING=False" - ) diff --git a/merlin/core/__init__.py b/merlin/core/__init__.py index f35898e5d..0dda4f9d5 100644 --- a/merlin/core/__init__.py +++ b/merlin/core/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ # limitations under the License. # +from merlin.config import validate_dask_configs from merlin.core import _version __version__ = _version.get_versions()["version"] +validate_dask_configs() diff --git a/merlin/dag/__init__.py b/merlin/dag/__init__.py index dca0c76dd..c668e8945 100644 --- a/merlin/dag/__init__.py +++ b/merlin/dag/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,10 @@ # # flake8: noqa +from merlin.config import validate_dask_configs + +validate_dask_configs() + from merlin.dag.graph import Graph from merlin.dag.node import Node, iter_nodes, postorder_iter_nodes, preorder_iter_nodes from merlin.dag.operator import DataFormats, Operator, Supports diff --git a/merlin/io/__init__.py b/merlin/io/__init__.py index ff4058c5a..851f5a558 100644 --- a/merlin/io/__init__.py +++ b/merlin/io/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - # flake8: noqa + +from merlin.config import validate_dask_configs + +validate_dask_configs() + from merlin.io import dataframe_iter, dataset, shuffle from merlin.io.dataframe_iter import DataFrameIter from merlin.io.dataset import MERLIN_METADATA_DIR_NAME, Dataset From 498e9b160c7a32dafb514f0d9a014868ef3714be Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 17 Jul 2024 08:27:17 -0700 Subject: [PATCH 8/8] add missing config module --- merlin/config/__init__.py | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 merlin/config/__init__.py diff --git a/merlin/config/__init__.py b/merlin/config/__init__.py new file mode 100644 index 000000000..a1f37a010 --- /dev/null +++ b/merlin/config/__init__.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +_DASK_QUERY_PLANNING_ENABLED = False +try: + # Disable query-planning and string conversion + import dask + + dask.config.set( + { + "dataframe.query-planning": False, + "dataframe.convert-string": False, + } + ) +except ImportError: + dask = None +else: + import sys + + import dask.dataframe as dd + from packaging.version import parse + + if parse(dask.__version__) > parse("2024.6.0"): + # For newer versions of dask, we can just check + # the official DASK_EXPR_ENABLED constant + _DASK_QUERY_PLANNING_ENABLED = dd.DASK_EXPR_ENABLED + else: + # For older versions of dask, we must assume query + # planning is enabled if dask_expr was imported + # (because we can't know for sure) + _DASK_QUERY_PLANNING_ENABLED = "dask_expr" in sys.modules + + +def validate_dask_configs(): + """Central check for problematic config options in Dask""" + if _DASK_QUERY_PLANNING_ENABLED: + raise NotImplementedError( + "Merlin does not support the query-planning API in " + "Dask Dataframe yet. Please make sure query-planning is " + "disabled before dask.dataframe is imported.\n\ne.g." + "dask.config.set({'dataframe.query-planning': False})" + "\n\nOr set the environment variable: " + "export DASK_DATAFRAME__QUERY_PLANNING=False" + ) + + if dask is not None and dask.config.get("dataframe.convert-string"): + raise NotImplementedError( + "Merlin does not support automatic string conversion in " + "Dask Dataframe yet. Please make sure this option is " + "disabled.\n\ne.g." + "dask.config.set({'dataframe.convert-string': False})" + "\n\nOr set the environment variable: " + "export DASK_DATAFRAME__CONVERT_STRING=False" + )