Merge branch 'branch-24.12' into wence/fix/12391

rapidsai · Oct 8, 2024 · 4684cc9 · 4684cc9
2 parents e2e6cbc + 09ed210
commit 4684cc9
Show file tree

Hide file tree

Showing 189 changed files with 3,120 additions and 1,309 deletions.
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
@@ -62,3 +62,33 @@ jobs:
         UPDATE_ITEM: true
         UPDATE_LINKED_ISSUES: true
       secrets: inherit
+
+    process-branch-name:
+      if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+      needs: get-project-id
+      runs-on: ubuntu-latest
+      outputs:
+        branch-name: ${{ steps.process-branch-name.outputs.branch-name }}
+      steps:
+        - name: Extract branch name
+          id: process-branch-name
+          run: |
+            branch=${{ github.event.pull_request.base.ref }}
+            release=${branch#branch-}
+            echo "branch-name=$release" >> "$GITHUB_OUTPUT"
+
+    update-release:
+      # This job sets the PR and its linked issues to the release they are targeting
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12
+      if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+      needs: [get-project-id, process-branch-name]
+      with:
+        PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+        SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ"
+        SINGLE_SELECT_FIELD_NAME: "Release"
+        SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}"
+        ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
+        ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+        UPDATE_ITEM: true
+        UPDATE_LINKED_ISSUES: true
+      secrets: inherit
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
@@ -5,11 +5,15 @@ set -euo pipefail
 
 package_dir="python/libcudf"
 
+export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
 mkdir -p ${package_dir}/final_dist
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libnvcomp.so.4 \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
@@ -24,14 +24,17 @@ rapids-logger "Download wheels"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep
 
-rapids-logger "Install pylibcudf"
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+rapids-logger "Install libcudf, pylibcudf and cudf_polars"
+python -m pip install \
+    -v \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)
 
 TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"

diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
@@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf
 
 rapids-logger "Check GPU usage"
 nvidia-smi
-
+rapids-print-env
 EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -63,7 +63,7 @@ dependencies:
 - openpyxl
 - packaging
 - pandas
-- pandas>=2.0,<2.2.3dev0
+- pandas>=2.0,<2.2.4dev0
 - pandoc
 - polars>=1.8,<1.9
 - pre-commit

diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -61,7 +61,7 @@ dependencies:
 - openpyxl
 - packaging
 - pandas
-- pandas>=2.0,<2.2.3dev0
+- pandas>=2.0,<2.2.4dev0
 - pandoc
 - polars>=1.8,<1.9
 - pre-commit

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -78,7 +78,7 @@ requirements:
   run:
     - python
     - typing_extensions >=4.0.0
-    - pandas >=2.0,<2.2.3dev0
+    - pandas >=2.0,<2.2.4dev0
     - cupy >=12.0.0
     - numba-cuda >=0.0.13
     - numpy >=1.23,<3.0a0

diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
@@ -77,7 +77,7 @@ requirements:
   run:
     - python
     - typing_extensions >=4.0.0
-    - pandas >=2.0,<2.2.3dev0
+    - pandas >=2.0,<2.2.4dev0
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<18.0.0a0
     - {{ pin_compatible('rmm', max_pin='x.x') }}

diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy
@@ -1,18 +1,47 @@
 ---
+# Notes on disabled checks
+# ------------------------
+# modernize-use-equals-default:
+#     auto-fix is broken (doesn't insert =default correctly)
+# modernize-concat-nested-namespaces:
+#     auto-fix is broken (can delete code)
+# modernize-use-trailing-return-type:
+#     Purely stylistic, no benefit to rewriting everything
+# modernize-return-braced-init-list:
+#     Stylistically we prefer to see the return type at the return site.
+#     See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672
+#     for more information.
+# modernize-use-bool-literals:
+#     Our tests use int flags for validity masks extensively and we prefer that
+# clang-analyzer-cplusplus.NewDeleteLeaks:
+#     This check has numerous bugs, see
+#     https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks
+#     We encounter at least
+#     https://github.com/llvm/llvm-project/issues/60896
+#     https://github.com/llvm/llvm-project/issues/69602
+# clang-analyzer-optin.core.EnumCastOutOfRange:
+#     We use enums as flags in multiple cases and this check makes ORing flags invalid
+# clang-analyzer-optin.cplusplus.UninitializedObject:
+#     There is an error in nanoarrow that none of the clang-tidy filters (i.e.
+#     header-filter and exclude-header-filter) are able to properly avoid. This
+#     merits further investigation.
+#
+# We need to verify that broken checks are still broken
 Checks:
       'modernize-*,
        -modernize-use-equals-default,
        -modernize-concat-nested-namespaces,
        -modernize-use-trailing-return-type,
-       -modernize-use-bool-literals'
-
-      # -modernize-use-equals-default        # auto-fix is broken (doesn't insert =default correctly)
-      # -modernize-concat-nested-namespaces  # auto-fix is broken (can delete code)
-      # -modernize-use-trailing-return-type  # just a preference
+       -modernize-return-braced-init-list,
+       -modernize-use-bool-literals,
+       clang-analyzer-*,
+       -clang-analyzer-cplusplus.NewDeleteLeaks,
+       -clang-analyzer-optin.core.EnumCastOutOfRange,
+       -clang-analyzer-optin.cplusplus.UninitializedObject'
 
 WarningsAsErrors: ''
-HeaderFilterRegex: ''
-AnalyzeTemporaryDtors: false
+HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*'
+ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*'
 FormatStyle:     none
 CheckOptions:
  - key:             modernize-loop-convert.MaxCopySize

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
 option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
 mark_as_advanced(CUDF_BUILD_TESTUTIL)
 option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
+option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON)
 option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
 mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
 option(

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -330,11 +330,11 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)
 
 # ##################################################################################################
 # * ast benchmark ---------------------------------------------------------------------------------
-ConfigureBench(AST_BENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
 
 # ##################################################################################################
 # * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
@@ -392,11 +392,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader
 ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
 ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
-# ##################################################################################################
-# * multi buffer memset benchmark
-# ----------------------------------------------------------------------
-ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp)
-
 # ##################################################################################################
 # * io benchmark ---------------------------------------------------------------------
 ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,14 +15,16 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
+#include <nvbench/nvbench.cuh>
+
 #include <algorithm>
 #include <list>
 #include <memory>
@@ -35,13 +37,10 @@ enum class TreeType {
 };
 
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-class AST : public cudf::benchmark {};
-
-template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
-static void BM_ast_transform(benchmark::State& state)
+static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const tree_levels{static_cast<cudf::size_type>(state.range(1))};
+  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
@@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state)
 
   auto const& expression_tree_root = expressions.back();
 
-  // Execute benchmark
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf::compute_column(table, expression_tree_root);
-  }
-
   // Use the number of bytes read from global memory
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0) *
-                          (tree_levels + 1) * sizeof(key_type));
-}
+  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
 
-static void CustomRanges(benchmark::internal::Benchmark* b)
-{
-  auto row_counts       = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
-  auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
-  for (auto const& row_count : row_counts) {
-    for (auto const& operation_count : operation_counts) {
-      b->Args({row_count, operation_count});
-    }
-  }
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
 }
 
 #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable)     \
-  (::benchmark::State & st)                                                                \
+  static void name(::nvbench::state& st)                                                   \
   {                                                                                        \
-    BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                    \
+    ::BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st);                  \
   }                                                                                        \
-  BENCHMARK_REGISTER_F(AST, name)                                                          \
-    ->Apply(CustomRanges)                                                                  \
-    ->Unit(benchmark::kMillisecond)                                                        \
-    ->UseManualTime();
+  NVBENCH_BENCH(name)                                                                      \
+    .set_name(#name)                                                                       \
+    .add_int64_axis("tree_levels", {1, 5, 10})                                             \
+    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);