Skip to content

Commit

Permalink
Remove dependency on forked libcudacxx (#4938)
Browse files Browse the repository at this point in the history
Move the memory_resource and stream_view headers to DALI codebase
and adjust the naming of files/namespaces so they no longer clash
with proper libcudacxx. 
The `_LIBCUDACXX_...` macros were replaced with `_DALI_...` macros and the files were autoformatted to pass DALI linter.
This should allow both DALI headers and libcudacxx to be safely
included in one code base.

Adjust the custom plugin to C++17 and regenerate the custom plugin example.
Add a workaround (WAR) for old CMake versions in CI that do not support this standard version for CUDA code.

Signed-off-by: Krzysztof Lecki <[email protected]>
  • Loading branch information
klecki authored Jul 7, 2023
1 parent afe1475 commit 9823dfd
Show file tree
Hide file tree
Showing 15 changed files with 1,191 additions and 94 deletions.
3 changes: 0 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,3 @@
[submodule "third_party/cocoapi"]
path = third_party/cocoapi
url = https://github.com/cocodataset/cocoapi
[submodule "third_party/libcudacxx"]
path = third_party/libcudacxx
url = https://github.com/mzient/libcudacxx.git
7 changes: 1 addition & 6 deletions cmake/Dependencies.common.cmake
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -242,11 +242,6 @@ set_target_properties(cocoapi PROPERTIES POSITION_INDEPENDENT_CODE ON)
list(APPEND DALI_LIBS cocoapi)
list(APPEND DALI_EXCLUDES libcocoapi.a)

##################################################################
# libcu++
##################################################################
include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third_party/libcudacxx/include)

##################################################################
# cfitsio
##################################################################
Expand Down
8 changes: 1 addition & 7 deletions dali/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -181,12 +181,6 @@ if (BUILD_PYTHON)
COMMAND cp -r "${PROJECT_SOURCE_DIR}/include/." "${PROJECT_BINARY_DIR}/${DALI_INCLUDE_DIR}"
)

# Copy libcu++ include files
add_custom_command(
TARGET install_headers
COMMAND cp -rL "${PROJECT_SOURCE_DIR}/third_party/libcudacxx/include/." "${PROJECT_BINARY_DIR}/${DALI_INCLUDE_DIR}/"
)

# Copy boost/preprocessor include files
add_custom_command(
TARGET install_headers
Expand Down
4 changes: 2 additions & 2 deletions dali/kernels/imgproc/resample/resampling_filters.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,7 +73,7 @@ void InitFilters(ResamplingFilters &filters) {
const int total_size = triangular_size + gaussian_size + cubic_size + lanczos_size;

constexpr bool need_staging =
!cuda::kind_has_property<MemoryKind, cuda::memory_access::host>::value;
!cuda_for_dali::kind_has_property<MemoryKind, cuda_for_dali::memory_access::host>::value;

using tmp_kind = std::conditional_t<need_staging, mm::memory_kind::host, MemoryKind>;
filters.filter_data = mm::alloc_raw_unique<float, tmp_kind>(total_size);
Expand Down
6 changes: 3 additions & 3 deletions dali/kernels/test/scatter_gather_test.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -73,7 +73,7 @@ class ScatterGatherTest : public testing::Test {

template <typename MemoryKind>
void Memcpy(void *dst, const void *src, size_t size, cudaMemcpyKind kind) {
if (cuda::kind_has_property<MemoryKind, cuda::memory_access::host>::value) {
if (cuda_for_dali::kind_has_property<MemoryKind, cuda_for_dali::memory_access::host>::value) {
memcpy(dst, src, size);
} else {
CUDA_CALL(cudaMemcpy(dst, src, size, kind));
Expand All @@ -82,7 +82,7 @@ class ScatterGatherTest : public testing::Test {

template <typename MemoryKind>
void Memset(void *dst, int c, size_t size) {
if (cuda::kind_has_property<MemoryKind, cuda::memory_access::host>::value) {
if (cuda_for_dali::kind_has_property<MemoryKind, cuda_for_dali::memory_access::host>::value) {
memset(dst, c, size);
} else {
CUDA_CALL(cudaMemset(dst, c, size));
Expand Down
21 changes: 11 additions & 10 deletions dali/pipeline/data/copy_to_external.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -135,22 +135,22 @@ inline void CopyToExternalImpl(void** dsts,
}

template <typename DstKind, typename SrcBackend>
inline void CopyToExternal(void* dst, const Tensor<SrcBackend> &src,
AccessOrder order, bool use_copy_kernel) {
inline void CopyToExternal(void *dst, const Tensor<SrcBackend> &src, AccessOrder order,
bool use_copy_kernel) {
const bool src_device_access = (std::is_same<SrcBackend, GPUBackend>::value || src.is_pinned());
const bool dst_device_access = cuda::kind_has_property<DstKind,
cuda::memory_access::device>::value;
const bool dst_device_access =
cuda_for_dali::kind_has_property<DstKind, cuda_for_dali::memory_access::device>::value;
use_copy_kernel &= dst_device_access && src_device_access;
using DstBackend = typename detail::kind2backend<DstKind>::type;
CopyToExternalImpl<DstBackend, SrcBackend>(dst, src, order, use_copy_kernel);
}

template <typename DstKind, typename SrcBackend>
inline void CopyToExternal(void* dst, const TensorList<SrcBackend> &src,
AccessOrder order, bool use_copy_kernel) {
inline void CopyToExternal(void *dst, const TensorList<SrcBackend> &src, AccessOrder order,
bool use_copy_kernel) {
const bool src_device_access = (std::is_same<SrcBackend, GPUBackend>::value || src.is_pinned());
const bool dst_device_access = cuda::kind_has_property<DstKind,
cuda::memory_access::device>::value;
const bool dst_device_access =
cuda_for_dali::kind_has_property<DstKind, cuda_for_dali::memory_access::device>::value;
use_copy_kernel &= dst_device_access && src_device_access;
using DstBackend = typename detail::kind2backend<DstKind>::type;
CopyToExternalImpl<DstBackend, SrcBackend>(dst, src, order, use_copy_kernel);
Expand Down Expand Up @@ -185,7 +185,8 @@ template <typename DstKind, typename SrcBackend>
inline void CopyToExternal(void** dsts, const TensorList<SrcBackend> &src,
AccessOrder order, bool use_copy_kernel) {
bool src_device_access = (std::is_same<SrcBackend, GPUBackend>::value || src.is_pinned());
bool dst_device_access = cuda::kind_has_property<DstKind, cuda::memory_access::device>::value;
bool dst_device_access =
cuda_for_dali::kind_has_property<DstKind, cuda_for_dali::memory_access::device>::value;
use_copy_kernel &= dst_device_access && src_device_access;
using DstBackend = typename detail::kind2backend<DstKind>::type;
CopyToExternalImpl<DstBackend, SrcBackend>(dsts, src, order, use_copy_kernel);
Expand Down
7 changes: 4 additions & 3 deletions dali/test/mat2tensor.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -71,8 +71,9 @@ TensorView<StorageCPU, T, ndim> view_as_tensor(cv::Mat &mat) {
template <typename MemoryKind = mm::memory_kind::device, typename T = uint8_t, int ndims = 3>
std::pair<TensorView<kind2storage_t<MemoryKind>, T, ndims>, mm::uptr<T>>
copy_as_tensor(const cv::Mat &mat) {
static_assert(cuda::kind_has_property<MemoryKind, cuda::memory_access::device>::value,
"A GPU-accessible memory kind is required.");
static_assert(
cuda_for_dali::kind_has_property<MemoryKind, cuda_for_dali::memory_access::device>::value,
"A GPU-accessible memory kind is required.");
auto tvin = kernels::view_as_tensor<const T, ndims>(mat);
return copy<MemoryKind>(tvin);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/nvidia/dali/include\n"
"/usr/local/lib/python3.10/dist-packages/nvidia/dali/include\n"
]
}
],
Expand All @@ -253,7 +253,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/nvidia/dali\n"
"/usr/local/lib/python3.10/dist-packages/nvidia/dali\n"
]
}
],
Expand All @@ -270,7 +270,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['-I/usr/local/lib/python3.6/dist-packages/nvidia/dali/include', '-D_GLIBCXX_USE_CXX11_ABI=1']\n"
"['-I/usr/local/lib/python3.10/dist-packages/nvidia/dali/include', '-D_GLIBCXX_USE_CXX11_ABI=1']\n"
]
}
],
Expand All @@ -287,7 +287,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"['-L/usr/local/lib/python3.6/dist-packages/nvidia/dali', '-ldali']\n"
"['-L/usr/local/lib/python3.10/dist-packages/nvidia/dali', '-ldali']\n"
]
}
],
Expand Down Expand Up @@ -315,7 +315,7 @@
"output_type": "stream",
"text": [
"cmake_minimum_required(VERSION 3.10)\r\n",
"set(CMAKE_CUDA_ARCHITECTURES \"35;50;52;60;61;70;75;80;86\")\r\n",
"set(CMAKE_CUDA_ARCHITECTURES \"50;60;70;80;90\")\r\n",
"\r\n",
"project(custom_dummy_plugin LANGUAGES CUDA CXX C)\r\n",
"\r\n",
Expand All @@ -324,7 +324,7 @@
"set(CMAKE_CXX_EXTENSIONS OFF)\r\n",
"set(CMAKE_C_STANDARD 11)\r\n",
"\r\n",
"set(CMAKE_CUDA_STANDARD 14)\r\n",
"set(CMAKE_CUDA_STANDARD 17)\r\n",
"set(CMAKE_CUDA_STANDARD_REQUIRED ON)\r\n",
"\r\n",
"include_directories(SYSTEM \"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}\")\r\n",
Expand Down Expand Up @@ -368,31 +368,29 @@
"name": "stdout",
"output_type": "stream",
"text": [
"-- The CUDA compiler identification is NVIDIA 11.4.48\n",
"-- The CXX compiler identification is GNU 7.5.0\n",
"-- The C compiler identification is GNU 7.5.0\n",
"-- The CUDA compiler identification is NVIDIA 12.0.76\n",
"-- The CXX compiler identification is GNU 11.3.0\n",
"-- The C compiler identification is GNU 11.3.0\n",
"-- Detecting CUDA compiler ABI info\n",
"-- Detecting CUDA compiler ABI info - done\n",
"-- Check for working CUDA compiler: /opt/ccache/bin/nvcc - skipped\n",
"-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped\n",
"-- Detecting CUDA compile features\n",
"-- Detecting CUDA compile features - done\n",
"-- Detecting CXX compiler ABI info\n",
"-- Detecting CXX compiler ABI info - done\n",
"-- Check for working CXX compiler: /opt/ccache/bin/g++ - skipped\n",
"-- Check for working CXX compiler: /usr/bin/c++ - skipped\n",
"-- Detecting CXX compile features\n",
"-- Detecting CXX compile features - done\n",
"-- Detecting C compiler ABI info\n",
"-- Detecting C compiler ABI info - done\n",
"-- Check for working C compiler: /opt/ccache/bin/gcc - skipped\n",
"-- Check for working C compiler: /usr/bin/cc - skipped\n",
"-- Detecting C compile features\n",
"-- Detecting C compile features - done\n",
"-- Configuring done\n",
"-- Generating done\n",
"-- Configuring done (4.4s)\n",
"-- Generating done (0.0s)\n",
"-- Build files have been written to: /dali/docs/examples/custom_operations/custom_operator/customdummy/build\n",
"\u001b[35m\u001b[1mScanning dependencies of target customdummy\u001b[0m\n",
"[ 33%] \u001b[32mBuilding CXX object CMakeFiles/customdummy.dir/dummy.cc.o\u001b[0m\n",
"[ 66%] \u001b[32mBuilding CUDA object CMakeFiles/customdummy.dir/dummy.cu.o\u001b[0m\n",
"nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
"[100%] \u001b[32m\u001b[1mLinking CXX shared library libcustomdummy.so\u001b[0m\n",
"[100%] Built target customdummy\n"
]
Expand Down Expand Up @@ -513,15 +511,15 @@
" \n",
" Keyword args\n",
" ------------\n",
" `bytes_per_sample_hint` : int or list of int, optional, default = [0]\n",
" `bytes_per_sample_hint` : int or list of int, optional, default = `[0]`\n",
" Output size hint, in bytes per sample.\n",
" \n",
" If specified, the operator's outputs residing in GPU or page-locked host memory will be preallocated\n",
" to accommodate a batch of samples of this size.\n",
" `preserve` : bool, optional, default = False\n",
" `preserve` : bool, optional, default = `False`\n",
" Prevents the operator from being removed from the\n",
" graph even if its outputs are not used.\n",
" `seed` : int, optional, default = -1\n",
" `seed` : int, optional, default = `-1`\n",
" Random seed.\n",
" \n",
" If not provided, it will be populated based on the global seed of the pipeline.\n",
Expand Down Expand Up @@ -552,6 +550,8 @@
"Help on class CustomDummy in module nvidia.dali.ops:\n",
"\n",
"class CustomDummy(builtins.object)\n",
" | CustomDummy(*, device='cpu', **kwargs)\n",
" | \n",
" | Make a copy of the input tensor\n",
" | \n",
" | Supported backends\n",
Expand All @@ -561,15 +561,15 @@
" | \n",
" | Keyword args\n",
" | ------------\n",
" | `bytes_per_sample_hint` : int or list of int, optional, default = [0]\n",
" | `bytes_per_sample_hint` : int or list of int, optional, default = `[0]`\n",
" | Output size hint, in bytes per sample.\n",
" | \n",
" | If specified, the operator's outputs residing in GPU or page-locked host memory will be preallocated\n",
" | to accommodate a batch of samples of this size.\n",
" | `preserve` : bool, optional, default = False\n",
" | `preserve` : bool, optional, default = `False`\n",
" | Prevents the operator from being removed from the\n",
" | graph even if its outputs are not used.\n",
" | `seed` : int, optional, default = -1\n",
" | `seed` : int, optional, default = `-1`\n",
" | Random seed.\n",
" | \n",
" | If not provided, it will be populated based on the global seed of the pipeline.\n",
Expand All @@ -586,16 +586,10 @@
" | `data` : TensorList\n",
" | Input to the operator.\n",
" | \n",
" | __init__(self, **kwargs)\n",
" | __init__(self, *, device='cpu', **kwargs)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors defined here:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | Readonly properties defined here:\n",
" | \n",
" | device\n",
" | \n",
Expand All @@ -606,6 +600,15 @@
" | spec\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors defined here:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | schema_name = 'CustomDummy'\n",
Expand All @@ -621,7 +624,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -635,7 +638,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.10.6"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.10)
set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86")
set(CMAKE_CUDA_ARCHITECTURES "50;60;70;80;90")

project(custom_dummy_plugin LANGUAGES CUDA CXX C)

Expand All @@ -8,8 +8,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_C_STANDARD 11)

set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# TODO(klecki): When the test container gets a CMake that supports C++17 as a proper option,
# swap those lines
# set(CMAKE_CUDA_STANDARD 17)
# set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17")

include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")

Expand Down
Loading

0 comments on commit 9823dfd

Please sign in to comment.