diff --git a/.gitmodules b/.gitmodules index 2b670675f7..ca263373bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -26,6 +26,3 @@ [submodule "third_party/cocoapi"] path = third_party/cocoapi url = https://github.com/cocodataset/cocoapi -[submodule "third_party/libcudacxx"] - path = third_party/libcudacxx - url = https://github.com/mzient/libcudacxx.git diff --git a/cmake/Dependencies.common.cmake b/cmake/Dependencies.common.cmake index 2f7ec98634..1fb4287b7c 100644 --- a/cmake/Dependencies.common.cmake +++ b/cmake/Dependencies.common.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -242,11 +242,6 @@ set_target_properties(cocoapi PROPERTIES POSITION_INDEPENDENT_CODE ON) list(APPEND DALI_LIBS cocoapi) list(APPEND DALI_EXCLUDES libcocoapi.a) -################################################################## -# libcu++ -################################################################## -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third_party/libcudacxx/include) - ################################################################## # cfitsio ################################################################## diff --git a/dali/CMakeLists.txt b/dali/CMakeLists.txt index 7d7cba8c80..8307f8ff3e 100644 --- a/dali/CMakeLists.txt +++ b/dali/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -181,12 +181,6 @@ if (BUILD_PYTHON) COMMAND cp -r "${PROJECT_SOURCE_DIR}/include/." 
"${PROJECT_BINARY_DIR}/${DALI_INCLUDE_DIR}" ) - # Copy libcu++ include files - add_custom_command( - TARGET install_headers - COMMAND cp -rL "${PROJECT_SOURCE_DIR}/third_party/libcudacxx/include/." "${PROJECT_BINARY_DIR}/${DALI_INCLUDE_DIR}/" - ) - # Copy boost/preprocessor include files add_custom_command( TARGET install_headers diff --git a/dali/kernels/imgproc/resample/resampling_filters.cu b/dali/kernels/imgproc/resample/resampling_filters.cu index 8d1a9d8483..9a69ddc548 100644 --- a/dali/kernels/imgproc/resample/resampling_filters.cu +++ b/dali/kernels/imgproc/resample/resampling_filters.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -73,7 +73,7 @@ void InitFilters(ResamplingFilters &filters) { const int total_size = triangular_size + gaussian_size + cubic_size + lanczos_size; constexpr bool need_staging = - !cuda::kind_has_property::value; + !cuda_for_dali::kind_has_property::value; using tmp_kind = std::conditional_t; filters.filter_data = mm::alloc_raw_unique(total_size); diff --git a/dali/kernels/test/scatter_gather_test.cc b/dali/kernels/test/scatter_gather_test.cc index fbf8dcb47e..1f3206ed78 100644 --- a/dali/kernels/test/scatter_gather_test.cc +++ b/dali/kernels/test/scatter_gather_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -73,7 +73,7 @@ class ScatterGatherTest : public testing::Test { template void Memcpy(void *dst, const void *src, size_t size, cudaMemcpyKind kind) { - if (cuda::kind_has_property::value) { + if (cuda_for_dali::kind_has_property::value) { memcpy(dst, src, size); } else { CUDA_CALL(cudaMemcpy(dst, src, size, kind)); @@ -82,7 +82,7 @@ class ScatterGatherTest : public testing::Test { template void Memset(void *dst, int c, size_t size) { - if (cuda::kind_has_property::value) { + if (cuda_for_dali::kind_has_property::value) { memset(dst, c, size); } else { CUDA_CALL(cudaMemset(dst, c, size)); diff --git a/dali/pipeline/data/copy_to_external.h b/dali/pipeline/data/copy_to_external.h index b09a5390c3..a4208b6f30 100644 --- a/dali/pipeline/data/copy_to_external.h +++ b/dali/pipeline/data/copy_to_external.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -135,22 +135,22 @@ inline void CopyToExternalImpl(void** dsts, } template -inline void CopyToExternal(void* dst, const Tensor &src, - AccessOrder order, bool use_copy_kernel) { +inline void CopyToExternal(void *dst, const Tensor &src, AccessOrder order, + bool use_copy_kernel) { const bool src_device_access = (std::is_same::value || src.is_pinned()); - const bool dst_device_access = cuda::kind_has_property::value; + const bool dst_device_access = + cuda_for_dali::kind_has_property::value; use_copy_kernel &= dst_device_access && src_device_access; using DstBackend = typename detail::kind2backend::type; CopyToExternalImpl(dst, src, order, use_copy_kernel); } template -inline void CopyToExternal(void* dst, const TensorList &src, - AccessOrder order, bool use_copy_kernel) { +inline void CopyToExternal(void *dst, const TensorList &src, AccessOrder order, + bool use_copy_kernel) { const bool src_device_access = (std::is_same::value || src.is_pinned()); - const bool dst_device_access = cuda::kind_has_property::value; + const bool dst_device_access = + cuda_for_dali::kind_has_property::value; use_copy_kernel &= dst_device_access && src_device_access; using DstBackend = typename detail::kind2backend::type; CopyToExternalImpl(dst, src, order, use_copy_kernel); @@ -185,7 +185,8 @@ template inline void CopyToExternal(void** dsts, const TensorList &src, AccessOrder order, bool use_copy_kernel) { bool src_device_access = (std::is_same::value || src.is_pinned()); - bool dst_device_access = cuda::kind_has_property::value; + bool dst_device_access = + cuda_for_dali::kind_has_property::value; use_copy_kernel &= dst_device_access && src_device_access; using DstBackend = typename detail::kind2backend::type; CopyToExternalImpl(dsts, src, order, use_copy_kernel); diff --git a/dali/test/mat2tensor.h b/dali/test/mat2tensor.h index d5d686d60c..f8b9afa881 100644 --- a/dali/test/mat2tensor.h +++ b/dali/test/mat2tensor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2021, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -71,8 +71,9 @@ TensorView view_as_tensor(cv::Mat &mat) { template std::pair, T, ndims>, mm::uptr> copy_as_tensor(const cv::Mat &mat) { - static_assert(cuda::kind_has_property::value, - "A GPU-accessible memory kind is required."); + static_assert( + cuda_for_dali::kind_has_property::value, + "A GPU-accessible memory kind is required."); auto tvin = kernels::view_as_tensor(mat); return copy(tvin); } diff --git a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb index 2e314c91fd..82d4721ea2 100644 --- a/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb +++ b/docs/examples/custom_operations/custom_operator/create_a_custom_operator.ipynb @@ -236,7 +236,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/local/lib/python3.6/dist-packages/nvidia/dali/include\n" + "/usr/local/lib/python3.10/dist-packages/nvidia/dali/include\n" ] } ], @@ -253,7 +253,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "/usr/local/lib/python3.6/dist-packages/nvidia/dali\n" + "/usr/local/lib/python3.10/dist-packages/nvidia/dali\n" ] } ], @@ -270,7 +270,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['-I/usr/local/lib/python3.6/dist-packages/nvidia/dali/include', '-D_GLIBCXX_USE_CXX11_ABI=1']\n" + "['-I/usr/local/lib/python3.10/dist-packages/nvidia/dali/include', '-D_GLIBCXX_USE_CXX11_ABI=1']\n" ] } ], @@ -287,7 +287,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "['-L/usr/local/lib/python3.6/dist-packages/nvidia/dali', '-ldali']\n" + "['-L/usr/local/lib/python3.10/dist-packages/nvidia/dali', '-ldali']\n" ] } ], @@ -315,7 +315,7 @@ "output_type": "stream", 
"text": [ "cmake_minimum_required(VERSION 3.10)\r\n", - "set(CMAKE_CUDA_ARCHITECTURES \"35;50;52;60;61;70;75;80;86\")\r\n", + "set(CMAKE_CUDA_ARCHITECTURES \"50;60;70;80;90\")\r\n", "\r\n", "project(custom_dummy_plugin LANGUAGES CUDA CXX C)\r\n", "\r\n", @@ -324,7 +324,7 @@ "set(CMAKE_CXX_EXTENSIONS OFF)\r\n", "set(CMAKE_C_STANDARD 11)\r\n", "\r\n", - "set(CMAKE_CUDA_STANDARD 14)\r\n", + "set(CMAKE_CUDA_STANDARD 17)\r\n", "set(CMAKE_CUDA_STANDARD_REQUIRED ON)\r\n", "\r\n", "include_directories(SYSTEM \"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}\")\r\n", @@ -368,31 +368,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "-- The CUDA compiler identification is NVIDIA 11.4.48\n", - "-- The CXX compiler identification is GNU 7.5.0\n", - "-- The C compiler identification is GNU 7.5.0\n", + "-- The CUDA compiler identification is NVIDIA 12.0.76\n", + "-- The CXX compiler identification is GNU 11.3.0\n", + "-- The C compiler identification is GNU 11.3.0\n", "-- Detecting CUDA compiler ABI info\n", "-- Detecting CUDA compiler ABI info - done\n", - "-- Check for working CUDA compiler: /opt/ccache/bin/nvcc - skipped\n", + "-- Check for working CUDA compiler: /usr/local/cuda/bin/nvcc - skipped\n", "-- Detecting CUDA compile features\n", "-- Detecting CUDA compile features - done\n", "-- Detecting CXX compiler ABI info\n", "-- Detecting CXX compiler ABI info - done\n", - "-- Check for working CXX compiler: /opt/ccache/bin/g++ - skipped\n", + "-- Check for working CXX compiler: /usr/bin/c++ - skipped\n", "-- Detecting CXX compile features\n", "-- Detecting CXX compile features - done\n", "-- Detecting C compiler ABI info\n", "-- Detecting C compiler ABI info - done\n", - "-- Check for working C compiler: /opt/ccache/bin/gcc - skipped\n", + "-- Check for working C compiler: /usr/bin/cc - skipped\n", "-- Detecting C compile features\n", "-- Detecting C compile features - done\n", - "-- Configuring done\n", - "-- Generating done\n", + "-- Configuring done (4.4s)\n", + "-- 
Generating done (0.0s)\n", "-- Build files have been written to: /dali/docs/examples/custom_operations/custom_operator/customdummy/build\n", - "\u001b[35m\u001b[1mScanning dependencies of target customdummy\u001b[0m\n", "[ 33%] \u001b[32mBuilding CXX object CMakeFiles/customdummy.dir/dummy.cc.o\u001b[0m\n", "[ 66%] \u001b[32mBuilding CUDA object CMakeFiles/customdummy.dir/dummy.cu.o\u001b[0m\n", - "nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n", "[100%] \u001b[32m\u001b[1mLinking CXX shared library libcustomdummy.so\u001b[0m\n", "[100%] Built target customdummy\n" ] @@ -513,15 +511,15 @@ " \n", " Keyword args\n", " ------------\n", - " `bytes_per_sample_hint` : int or list of int, optional, default = [0]\n", + " `bytes_per_sample_hint` : int or list of int, optional, default = `[0]`\n", " Output size hint, in bytes per sample.\n", " \n", " If specified, the operator's outputs residing in GPU or page-locked host memory will be preallocated\n", " to accommodate a batch of samples of this size.\n", - " `preserve` : bool, optional, default = False\n", + " `preserve` : bool, optional, default = `False`\n", " Prevents the operator from being removed from the\n", " graph even if its outputs are not used.\n", - " `seed` : int, optional, default = -1\n", + " `seed` : int, optional, default = `-1`\n", " Random seed.\n", " \n", " If not provided, it will be populated based on the global seed of the pipeline.\n", @@ -552,6 +550,8 @@ "Help on class CustomDummy in module nvidia.dali.ops:\n", "\n", "class CustomDummy(builtins.object)\n", + " | CustomDummy(*, device='cpu', **kwargs)\n", + " | \n", " | Make a copy of the input tensor\n", " | \n", " | Supported backends\n", @@ -561,15 +561,15 @@ " | \n", " | Keyword args\n", " | ------------\n", - " | `bytes_per_sample_hint` : int or list of int, optional, default = 
[0]\n", + " | `bytes_per_sample_hint` : int or list of int, optional, default = `[0]`\n", " | Output size hint, in bytes per sample.\n", " | \n", " | If specified, the operator's outputs residing in GPU or page-locked host memory will be preallocated\n", " | to accommodate a batch of samples of this size.\n", - " | `preserve` : bool, optional, default = False\n", + " | `preserve` : bool, optional, default = `False`\n", " | Prevents the operator from being removed from the\n", " | graph even if its outputs are not used.\n", - " | `seed` : int, optional, default = -1\n", + " | `seed` : int, optional, default = `-1`\n", " | Random seed.\n", " | \n", " | If not provided, it will be populated based on the global seed of the pipeline.\n", @@ -586,16 +586,10 @@ " | `data` : TensorList\n", " | Input to the operator.\n", " | \n", - " | __init__(self, **kwargs)\n", + " | __init__(self, *, device='cpu', **kwargs)\n", " | \n", " | ----------------------------------------------------------------------\n", - " | Data descriptors defined here:\n", - " | \n", - " | __dict__\n", - " | dictionary for instance variables (if defined)\n", - " | \n", - " | __weakref__\n", - " | list of weak references to the object (if defined)\n", + " | Readonly properties defined here:\n", " | \n", " | device\n", " | \n", @@ -606,6 +600,15 @@ " | spec\n", " | \n", " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + " | \n", + " | ----------------------------------------------------------------------\n", " | Data and other attributes defined here:\n", " | \n", " | schema_name = 'CustomDummy'\n", @@ -621,7 +624,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": 
"python3" }, @@ -635,7 +638,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/docs/examples/custom_operations/custom_operator/customdummy/CMakeLists.txt b/docs/examples/custom_operations/custom_operator/customdummy/CMakeLists.txt index 68cfeee4b1..6d97d84a7f 100644 --- a/docs/examples/custom_operations/custom_operator/customdummy/CMakeLists.txt +++ b/docs/examples/custom_operations/custom_operator/customdummy/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.10) -set(CMAKE_CUDA_ARCHITECTURES "35;50;52;60;61;70;75;80;86") +set(CMAKE_CUDA_ARCHITECTURES "50;60;70;80;90") project(custom_dummy_plugin LANGUAGES CUDA CXX C) @@ -8,8 +8,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_C_STANDARD 11) -set(CMAKE_CUDA_STANDARD 14) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) +# TODO(klecki): When the test container gets a CMake that supports C++17 as a proper option, +# swap those lines +# set(CMAKE_CUDA_STANDARD 17) +# set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17") include_directories(SYSTEM "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") diff --git a/include/dali/core/backend_tags.h b/include/dali/core/backend_tags.h index 22259acffe..179d123981 100644 --- a/include/dali/core/backend_tags.h +++ b/include/dali/core/backend_tags.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,7 +16,7 @@ #define DALI_CORE_BACKEND_TAGS_H_ #include -#include +#include "dali/core/mm/cuda_memory_resource.h" namespace dali { @@ -59,22 +59,22 @@ template struct kind2storage; template <> -struct kind2storage { +struct kind2storage { using type = StorageCPU; }; template <> -struct kind2storage { +struct kind2storage { using type = StorageCPU; }; template <> -struct kind2storage { +struct kind2storage { using type = StorageGPU; }; template <> -struct kind2storage { +struct kind2storage { using type = StorageUnified; }; diff --git a/include/dali/core/mm/cuda_memory_resource.h b/include/dali/core/mm/cuda_memory_resource.h new file mode 100644 index 0000000000..0bdbd84c21 --- /dev/null +++ b/include/dali/core/mm/cuda_memory_resource.h @@ -0,0 +1,982 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef DALI_CORE_MM_CUDA_MEMORY_RESOURCE_H_ +#define DALI_CORE_MM_CUDA_MEMORY_RESOURCE_H_ + + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif + +#include +#include +#include +#include +// #include // C++20 +#include "dali/core/mm/cuda_stream_view.h" + +#define _DALI_STD_VER 17 + +// DALI is built with exceptions +#define _DALI_EXT_RTTI_ENABLED + +#ifdef _DALI_EXT_RTTI_ENABLED +#include +#endif + +// #if __has_include() +#include +#define _DALI_STD_PMR_NS ::std::pmr +// #elif __has_include() +// #include +// #define _DALI_STD_PMR_NS ::std::experimental::pmr +// #endif // __has_include() + + +// no-op-define, +#define _DALI_TEMPLATE_VIS +#define _DALI_INLINE_VISIBILITY __host__ __device__ + +namespace dali { +namespace cuda_for_dali { + +/*! + * \brief Groups the tag types denoting the kind of memory of an allocation. + * + * Memory allocation kind determines where memory can be accessed and the + * performance characteristics of accesses. + * + * This is not a closed set, the user code can define custom memory kinds. + */ +namespace memory_kind { +/*! + * \brief Ordinary host memory + */ +struct host; + +/*! + * \brief Device memory, as allocated by cudaMalloc. + */ +struct device; + +/*! + * \brief Device-accessible host memory. + */ +struct pinned; + +/*! + * \brief Virtual memory that is automatically migrated between the host and devices. 
+ */ +struct managed; +}; // namespace memory_kind + +namespace detail { + +template +struct __type_pack {}; + +namespace __fallback_typeid { + +template +struct _DALI_TEMPLATE_VIS __unique_typeinfo { + static constexpr int __id = 0; +}; +template +constexpr int __unique_typeinfo<_Tp>::__id; + +template +inline _DALI_INLINE_VISIBILITY constexpr const void *__get_fallback_typeid() { + return &__unique_typeinfo>::__id; +} + +template +const ::std::type_info *__get_typeid() { +#ifdef _DALI_EXT_RTTI_ENABLED + return &typeid(_Tp); +#else + return nullptr; +#endif +} + +inline bool __compare_type(const ::std::type_info *__ti1, const void *__fallback_ti1, + const ::std::type_info *__ti2, const void *__fallback_ti2) { +#ifdef _DALI_EXT_RTTI_ENABLED + if (__ti1 && __ti2 && *__ti1 == *__ti2) + return true; +#endif + return __fallback_ti1 == __fallback_ti2; +} + +template +bool __is_type(const ::std::type_info *__ti1, const void *__fallback_ti1) { + return __compare_type(__ti1, __fallback_ti1, __get_typeid<_Tp>(), __get_fallback_typeid<_Tp>()); +} + +} // namespace __fallback_typeid + +} // namespace detail + +template +class basic_resource_view; + +/*! + * \brief Groups the tag types denoting the execution environment in which the memory can be + * accessed + * + * This is not a closed set, the user code can define custom accessibility. + */ +namespace memory_access { +struct host; +struct device; +} // namespace memory_access + +/*! + * \brief A memory property tag type indicating that the memory can be oversubscribed. + * + * Oversubscribable memory doesn't need to have backing physical storage at all times. + */ +struct oversubscribable; + +/*! + * \brief A memory property tag type indicating that the memory has a backing physical + * storage in the target location at all times. + */ +struct resident; + +/*! 
+ * \brief Groups the tag types that denote the actual location of the physical storage + * + * Memory kinds which can be migrated between locations can define multiple locations. + */ +namespace memory_location { +/*! + * \brief A memory property tag type indicating that the memory is located on a device + */ +struct device; + +/*! + * \brief A memory property tag type indicating that the memory is located in the host memory + */ +struct host; +} // namespace memory_location + +template +class memory_resource; + +namespace detail { + +class memory_resource_base { + public: + static constexpr std::size_t default_alignment = alignof(std::max_align_t); + + /*! + * \brief Allocates storage of size at least `__bytes` bytes. + * + * The returned storage is aligned to the specified `__alignment` if such + * alignment is supported. Otherwise throws. + * + * Storage may be accessed immediately within the execution contexts that + * can access the memory. + * + * \throws If storage of the requested size and alignment cannot be obtained. + * + * \param __bytes The size in bytes of the allocation + * \param __alignment The alignment of the allocation + * \return Pointer to the requested storage + */ + void *allocate(size_t __bytes, size_t __alignment = default_alignment) { + return do_allocate(__bytes, __alignment); + } + + /*! + * \brief Deallocates the storage pointed to by `__p`. + * + * `__p` must have been returned by a prior call to `allocate(__bytes, + * __alignment)` on a `memory_resource` that compares equal to `*this`, and + * the storage it points to must not yet have been deallocated, otherwise + * behavior is undefined. + * + * \throws Nothing. + * + * \param __p Pointer to storage to be deallocated + * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` call that + * returned `__p`. + * \param __alignment The alignment of the allocation. 
This must be equal to + * the value of `__alignment` that was specified to the `allocate` call that + * returned `__p`. + */ + void deallocate(void *__mem, size_t __bytes, size_t __alignment = default_alignment) { + do_deallocate(__mem, __bytes, __alignment); + } + + /*! + * \brief Tries to cast the resource to a resource of given kind + */ + template + memory_resource<_Kind> *as_kind() noexcept { + using __tag = detail::__type_pack<_Kind>; + return static_cast *>( + __do_as_kind(detail::__fallback_typeid::__get_typeid<__tag>(), + detail::__fallback_typeid::__get_fallback_typeid<__tag>())); + } + + /*! + * \brief Tries to cast the resource to a resource of given kind + */ + template + const memory_resource<_Kind> *as_kind() const noexcept { + using __tag = detail::__type_pack<_Kind>; + return static_cast *>( + __do_as_kind(detail::__fallback_typeid::__get_typeid<__tag>(), + detail::__fallback_typeid::__get_fallback_typeid<__tag>())); + } + + protected: + virtual void *do_allocate(size_t __bytes, size_t __alignment) = 0; + virtual void do_deallocate(void *__mem, size_t __bytes, size_t __alignment) = 0; + + virtual bool is_equal_base(const memory_resource_base &other) const noexcept = 0; + + bool is_equal(const memory_resource_base &other) const noexcept { + return is_equal_base(other); + } + + template + friend class cuda_for_dali::basic_resource_view; + + virtual void *__do_as_kind(const ::std::type_info *__tag_type_id, + const void *__tag_type_fallback_id) const noexcept = 0; +}; + +class stream_ordered_memory_resource_base : public virtual memory_resource_base { + public: + /*! + * \brief Allocates storage of size at least `__bytes` bytes in stream order + * on `__stream`. + * + * The returned storage is aligned to `default_alignment`. + * + * The returned storage may be used immediately only on `__stream`. Accessing + * it on any other stream (or the host) requires first synchronizing with + * `__stream`. 
+ * + * \throws If the storage of the requested size and `default_alignment` cannot + * be obtained. + * + * \param __bytes The size in bytes of the allocation. + * \param __stream The stream on which to perform the allocation. + * \return Pointer to the requested storage. + */ + void *allocate_async(size_t bytes, stream_view stream) { + return allocate_async(bytes, default_alignment, stream); + } + /*! + * \brief Allocates storage of size at least `__bytes` bytes in stream order + * on `__stream`. + * + * The returned storage is aligned to the specified `__alignment` if such + * alignment is supported. + * + * The returned storage may be used immediately only on `__stream`. Using it + * on any other stream (or the host) requires first synchronizing with + * `__stream`. + * + * \throws If the storage of the requested size and alignment cannot be + * obtained. + * + * \param __bytes The size in bytes of the allocation. + * \param __alignment The alignment of the allocation + * \param __stream The stream on which to perform the allocation. + * \return Pointer to the requested storage. + */ + void *allocate_async(size_t bytes, size_t alignment, stream_view stream) { + return do_allocate_async(bytes, alignment, stream); + } + + /*! + * \brief Deallocates the storage pointed to by `__p` in stream order on + * `__stream`. + * + * `__p` must have been returned by a prior call to + * `allocate_async(__bytes, default_alignment)` or `allocate(__bytes, + * default_alignment)` on a `stream_ordered_memory_resource` that compares + * equal to `*this`, and the storage it points to must not yet have been + * deallocated, otherwise behavior is undefined. + * + * Asynchronous, stream-ordered operations on `__stream` initiated before + * `deallocate_async(__p, __bytes, __stream)` may still access the storage + * pointed to by `__p` after `deallocate_async` returns. 
+ * + * Storage deallocated on `__stream` may be reused by a future + * call to `allocate_async` on the same stream without synchronizing + * `__stream`. Therefore, `__stream` is typically the last stream on which + * `__p` was last used. It is the caller's responsibility to ensure the + * storage pointed to by `__p` is not in use on any other stream (or the + * host), or behavior is undefined. + * + * \param __p Pointer to storage to be deallocated. + * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __stream The stream on which to perform the deallocation. + */ + void deallocate_async(void *__mem, size_t __bytes, stream_view __stream) { + deallocate_async(__mem, __bytes, default_alignment, __stream); + } + + /*! + * \brief Deallocates the storage pointed to by `__p` in stream order on + * `__stream`. + * + * `__p` must have been returned by a prior call to + * `allocate_async(__bytes, __alignment)` or `allocate(__bytes, + * __alignment)` on a `stream_ordered_memory_resource` that compares + * equal to `*this`, and the storage it points to must not yet have been + * deallocated, otherwise behavior is undefined. + * + * Asynchronous, stream-ordered operations on `__stream` initiated before + * `deallocate_async(__p, __bytes, __stream)` may still access the storage + * pointed to by `__p` after `deallocate_async` returns. + * + * Storage deallocated on `__stream` may be reused by a future + * call to `allocate_async` on the same stream without synchronizing + * `__stream`. Therefore, `__stream` is typically the last stream on which + * `__p` was last used. It is the caller's responsibility to ensure the + * storage pointed to by `__p` is not in use on any other stream (or the + * host), or behavior is undefined. + * + * \param __p Pointer to storage to be deallocated. 
+ * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __alignment The alignment of the allocation. This must be equal to + * the value of `__alignment` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __stream The stream on which to perform the deallocation. + */ + void deallocate_async(void *__mem, size_t __bytes, size_t __alignment, stream_view __stream) { + do_deallocate_async(__mem, __bytes, __alignment, __stream); + } + + protected: + virtual void *do_allocate_async(size_t __bytes, size_t __alignment, stream_view __stream) = 0; + virtual void do_deallocate_async(void *__mem, size_t __bytes, size_t __alignment, + stream_view __stream) = 0; + + template + friend class cuda_for_dali::basic_resource_view; +}; + +} // namespace detail + +/*! + * \brief Abstract interface for memory allocation. + * + * \tparam _MemoryKind The kind of the allocated memory. + */ +template +class memory_resource : private virtual detail::memory_resource_base { + public: + using memory_kind = _MemoryKind; + static constexpr std::size_t default_alignment = memory_resource_base::default_alignment; + + virtual ~memory_resource() = default; + + /*! + * \brief Allocates storage of size at least `__bytes` bytes. + * + * The returned storage is aligned to the specified `__alignment` if such + * alignment is supported. Otherwise throws. + * + * Storage may be accessed immediately within the execution contexts that + * can access the memory. + * + * \throws If storage of the requested size and alignment cannot be obtained. 
+ * + * \param __bytes The size in bytes of the allocation + * \param __alignment The alignment of the allocation + * \return Pointer to the requested storage + */ + void *allocate(std::size_t __bytes, std::size_t __alignment = default_alignment) { + return do_allocate(__bytes, __alignment); + } + + /*! + * \brief Deallocates the storage pointed to by `__p`. + * + * `__p` must have been returned by a prior call to `allocate(__bytes, + * __alignment)` on a `memory_resource` that compares equal to `*this`, and + * the storage it points to must not yet have been deallocated, otherwise + * behavior is undefined. + * + * \throws Nothing. + * + * \param __p Pointer to storage to be deallocated + * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` call that + * returned `__p`. + * \param __alignment The alignment of the allocation. This must be equal to + * the value of `__alignment` that was specified to the `allocate` call that + * returned `__p`. + */ + void deallocate(void *__p, std::size_t __bytes, std::size_t __alignment = default_alignment) { + do_deallocate(__p, __bytes, __alignment); + } + + /*! + * \brief Compare this resource to another. + * + * Two resources compare equal if and only if memory allocated from one + * resource can be deallocated from the other and vice versa. 
+ * + * \param __other The other resource to compare against + */ + bool is_equal(memory_resource const &__other) const noexcept { + return do_is_equal(__other); + } + + private: + template + friend class basic_resource_view; + + void *do_allocate(std::size_t __bytes, std::size_t __alignment) override = 0; + + void do_deallocate(void *__p, std::size_t __bytes, std::size_t __alignment) override = 0; + + // Default to identity comparison + virtual bool do_is_equal(memory_resource const &__other) const noexcept { + return this == &__other; + } + + void *__do_as_kind(const ::std::type_info *__tag_type_id, + const void *__tag_type_fallback_id) const noexcept final { + using __tag = detail::__type_pack; + return detail::__fallback_typeid::__is_type<__tag>(__tag_type_id, __tag_type_fallback_id) ? + const_cast(this) : + nullptr; + } + + bool is_equal_base(const detail::memory_resource_base &__other) const noexcept final { + if (auto *__other_res = __other.as_kind()) { + return do_is_equal(*__other_res); + } else { + return false; + } + } +}; + +template +inline _DALI_INLINE_VISIBILITY bool operator==(const memory_resource<_Kind> &__a, + const memory_resource<_Kind> &__b) { + return __a.is_equal(__b); +} + +/*! + * \brief Abstract interface for CUDA stream-ordered memory allocation. + * + * "Stream-ordered memory allocation" extends the CUDA programming model to + * include memory allocation as stream-ordered operations. + * + * All asynchronous accesses of the allocation must happen between the stream + * execution of the allocation and the free. If storage is accessed outside of + * the promised stream order, a use before allocation / use after free error + * will cause undefined behavior. + * + * Allocating on stream `s0` returns memory that is valid to access immediately + * only on `s0`. Accessing it on any other stream (or the host) first requires + * synchronization with `s0`, otherwise behavior is undefined. 
+ * + * Deallocating memory on stream `s1` indicates that it is valid to reuse the + * deallocated memory immediately for another allocation on `s1`. + * + * Asynchronous, stream-ordered operations ordered before deallocation on `s1` + * may still access the storage after deallocation completes. + * + * Memory may be allocated and deallocated on different streams, `s0` and `s1` + * respectively, but requires synchronization between `s0` and `s1` before the + * deallocation occurs. + * + * \tparam _MemoryKind The kind of the allocated memory. + */ +template +class stream_ordered_memory_resource : public virtual memory_resource<_MemoryKind>, + private virtual detail::stream_ordered_memory_resource_base { + public: + using memory_kind = _MemoryKind; + static constexpr std::size_t default_alignment = memory_resource<_MemoryKind>::default_alignment; + + /*! + * \brief Allocates storage of size at least `__bytes` bytes in stream order + * on `__stream`. + * + * The returned storage is aligned to `default_alignment`. + * + * The returned storage may be used immediately only on `__stream`. Accessing + * it on any other stream (or the host) requires first synchronizing with + * `__stream`. + * + * \throws If the storage of the requested size and `default_alignment` cannot + * be obtained. + * + * \param __bytes The size in bytes of the allocation. + * \param __stream The stream on which to perform the allocation. + * \return Pointer to the requested storage. + */ + void *allocate_async(std::size_t __bytes, stream_view __stream) { + return do_allocate_async(__bytes, default_alignment, __stream); + } + + /*! + * \brief Allocates storage of size at least `__bytes` bytes in stream order + * on `__stream`. + * + * The returned storage is aligned to the specified `__alignment` if such + * alignment is supported. + * + * The returned storage may be used immediately only on `__stream`. Using it + * on any other stream (or the host) requires first synchronizing with + * `__stream`. 
+ * + * \throws If the storage of the requested size and alignment cannot be + * obtained. + * + * \param __bytes The size in bytes of the allocation. + * \param __alignment The alignment of the allocation + * \param __stream The stream on which to perform the allocation. + * \return Pointer to the requested storage. + */ + void *allocate_async(std::size_t __bytes, std::size_t __alignment, stream_view __stream) { + return do_allocate_async(__bytes, __alignment, __stream); + } + + /*! + * \brief Deallocates the storage pointed to by `__p` in stream order on + * `__stream`. + * + * `__p` must have been returned by a prior call to + * `allocate_async(__bytes, default_alignment)` or `allocate(__bytes, + * default_alignment)` on a `stream_ordered_memory_resource` that compares + * equal to `*this`, and the storage it points to must not yet have been + * deallocated, otherwise behavior is undefined. + * + * Asynchronous, stream-ordered operations on `__stream` initiated before + * `deallocate_async(__p, __bytes, __stream)` may still access the storage + * pointed to by `__p` after `deallocate_async` returns. + * + * Storage deallocated on `__stream` may be reused by a future + * call to `allocate_async` on the same stream without synchronizing + * `__stream`. Therefore, `__stream` is typically the last stream on which + * `__p` was last used. It is the caller's responsibility to ensure the + * storage pointed to by `__p` is not in use on any other stream (or the + * host), or behavior is undefined. + * + * \param __p Pointer to storage to be deallocated. + * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __stream The stream on which to perform the deallocation. + */ + void deallocate_async(void *__p, std::size_t __bytes, stream_view __stream) { + do_deallocate_async(__p, __bytes, default_alignment, __stream); + } + + /*! 
+ * \brief Deallocates the storage pointed to by `__p` in stream order on + * `__stream`. + * + * `__p` must have been returned by a prior call to + * `allocate_async(__bytes, __alignment)` or `allocate(__bytes, + * __alignment)` on a `stream_ordered_memory_resource` that compares + * equal to `*this`, and the storage it points to must not yet have been + * deallocated, otherwise behavior is undefined. + * + * Asynchronous, stream-ordered operations on `__stream` initiated before + * `deallocate_async(__p, __bytes, __stream)` may still access the storage + * pointed to by `__p` after `deallocate_async` returns. + * + * Storage deallocated on `__stream` may be reused by a future + * call to `allocate_async` on the same stream without synchronizing + * `__stream`. Therefore, `__stream` is typically the last stream on which + * `__p` was last used. It is the caller's responsibility to ensure the + * storage pointed to by `__p` is not in use on any other stream (or the + * host), or behavior is undefined. + * + * \param __p Pointer to storage to be deallocated. + * \param __bytes The size in bytes of the allocation. This must be equal to + * the value of `__bytes` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __alignment The alignment of the allocation. This must be equal to + * the value of `__alignment` that was specified to the `allocate` or + * `allocate_async` call that returned `__p`. + * \param __stream The stream on which to perform the deallocation. 
+ */ + void deallocate_async(void *__p, std::size_t __bytes, std::size_t __alignment, + stream_view __stream) { + do_deallocate_async(__p, __bytes, __alignment, __stream); + } + + private: + template + friend class basic_resource_view; + + /// Default synchronous implementation of `memory_resource::do_allocate` + void *do_allocate(std::size_t __bytes, std::size_t __alignment) override { + auto const __default_stream = stream_view{}; + auto __p = do_allocate_async(__bytes, __alignment, __default_stream); + __default_stream.wait(); + return __p; + } + + /// Default synchronous implementation of `memory_resource::do_deallocate` + void do_deallocate(void *__p, std::size_t __bytes, std::size_t __alignment) override { + auto const __default_stream = stream_view{}; + __default_stream.wait(); + do_deallocate_async(__p, __bytes, __alignment, __default_stream); + } + + void *do_allocate_async(std::size_t __bytes, std::size_t __alignment, + stream_view __stream) override = 0; + + void do_deallocate_async(void *__p, std::size_t __bytes, std::size_t __alignment, + stream_view __stream) override = 0; +}; + + +/*! + * \brief Indicates whether a memory kind `_MemoryKind` has a property `__property`. + */ +template +struct kind_has_property : std::false_type {}; + +/*! + * \brief A special property telling that given resource/resource view allocates + * memory of specific kind. + * + * When a view defines this property, it implicitly has all properties of this + * memory kind. + * This property is also a property in itself and views defining properties of + * the underlying memory kind cannot be converted to a view defining this property. + * This allows for future extension of the set of properties. 
+ */ +template +struct is_kind; + +template +struct kind_has_property<_MemoryKind, is_kind<_MemoryKind>> : std::true_type {}; + +#define _DALI_MEMORY_KIND_PROPERTY(__kind, __property) \ + template <> \ + struct kind_has_property<__kind, __property> : std::true_type {}; + +_DALI_MEMORY_KIND_PROPERTY(memory_kind::host, memory_access::host); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::host, oversubscribable); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::host, memory_location::host); + +_DALI_MEMORY_KIND_PROPERTY(memory_kind::pinned, memory_access::host); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::pinned, memory_access::device); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::pinned, resident); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::pinned, memory_location::host); + +_DALI_MEMORY_KIND_PROPERTY(memory_kind::device, memory_access::device); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::device, resident); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::device, memory_location::device); + +_DALI_MEMORY_KIND_PROPERTY(memory_kind::managed, memory_access::host); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::managed, memory_access::device); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::managed, oversubscribable); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::managed, memory_location::host); +_DALI_MEMORY_KIND_PROPERTY(memory_kind::managed, memory_location::device); + +namespace detail { +template +std::false_type Has_Property(...); +template +kind_has_property<_MemoryKind, _Property> Has_Property(const memory_resource<_MemoryKind> *); +template +kind_has_property<_MemoryKind, _Property> Has_Property( + const stream_ordered_memory_resource<_MemoryKind> *); +} // namespace detail + +template +struct has_property : decltype(detail::Has_Property<_Property>( + std::declval *>())) {}; + +namespace detail { +template +struct is_property_in : std::false_type {}; + +template +struct is_property_in<_Property, _Mismatch, _Tail...> : is_property_in<_Property, _Tail...> {}; + +template +struct is_property_in<_Property, _Property, 
_Tail...> : std::true_type {}; + +template +struct is_property_in<_Property, is_kind<_MemoryKind>, Tail...> + : kind_has_property<_MemoryKind, _Property> {}; + +template +struct is_property_in, is_kind<_MemoryKind>, Tail...> : std::true_type {}; + +template +struct is_resource_pointer_convertible : std::is_convertible<_FromPointer, _ToPointer> {}; +// Private inheritance from (stream_ordered_)memory_resource_base* requires explicit partial +// specializations as `is_convertible` will return false + +template +struct is_resource_pointer_convertible<_FromPointer, detail::memory_resource_base *> + : std::conjunction< + std::is_pointer<_FromPointer>, + std::is_base_of::element_type>> {}; + +template +struct is_resource_pointer_convertible<_FromPointer, detail::stream_ordered_memory_resource_base *> + : std::conjunction< + std::is_pointer<_FromPointer>, + std::is_base_of::element_type>> {}; + +} // namespace detail + +template +struct has_property, _Property> + : detail::is_property_in<_Property, _Properties...> {}; + +template +struct is_view_convertible; + +template +struct is_view_convertible, + basic_resource_view<_ToPointer, _ToProperties...>> + : std::conjunction< + detail::is_resource_pointer_convertible<_FromPointer, _ToPointer>, + has_property, _ToProperties>...> {}; + +/*! + * \brief A pointer-like object to a memory resource based on resource. + * + * Resource view is an object that acts as a memory resource pointer, but provides + * enhanced implicit conversions. The idea behind this type is that a user of + * a memory resource may be interested in many kinds of resources as long as they + * have certain properties. For example, a function may work with any resource + * that can provide host-accessible memory, regardless of whether it is plain host + * memory, pinned memory, managed memory, or some,yet-to-be-defined future kind + * of memory. 
+ *
+ * A resource view can be created from a memory resource pointer or from another
+ * resource view that defines a superset of the target properties.
+ *
+ * The resource view exposes the underlying resource's interface via `operator->`.
+ *
+ * The `basic_resource_view` class can be parameterized with the resource pointer type,
+ * which can be either one of the base resource classes or a concrete resource type.
+ *
+ * \tparam _ResourcePointer a pointer-like object to the underlying memory resource
+ * \tparam _Properties properties of a memory resource required by resource view
+ */
+template
+class basic_resource_view {
+ public:
+  static_assert(
+      std::is_base_of::element_type>::value ||
+          std::is_base_of::element_type>::value,
+      "ResourcePointer must be a pointer to a memory_resource_base, "
+      "stream_ordered_memory_resource_base or a derived class");
+
+  basic_resource_view() = default;
+
+  basic_resource_view(int) = delete;  // NOLINT(runtime/explicit)
+
+  basic_resource_view(std::nullptr_t) {}  // NOLINT(runtime/explicit)
+
+  /*!
+   * \brief Constructs a resource view from a compatible memory resource pointer.
+   *
+   * The memory resource is considered compatible if a pointer to it can be converted to
+   * `_ResourcePointer` and the resource type has the required properties listed
+   * in `_Properties`.
+   *
+   * \tparam _Resource Type of a memory resource object.
+   * \param __p pointer to a memory resource object.
+   */
+  template ::value &&
+                             std::conjunction...>::value>>
+  basic_resource_view(_Resource *__p) : __pointer(__p) {}  // NOLINT(runtime/explicit)
+
+  /*!
+   * \brief Constructs a resource view by copying the resource pointer from a compatible resource
+   * view.
+   *
+   * A resource view is considered compatible if it defines all properties required by this
+   * view in `_Properties`.
+ *
+ * \tparam _OtherPointer The resource pointer type of the source resource view
+ * \tparam _OtherProperties The properties defined by the source resource view
+ */
+  template <
+      typename _OtherPointer, typename... _OtherProperties,
+      typename = std::enable_if_t, basic_resource_view>::value>>
+  basic_resource_view(basic_resource_view<_OtherPointer, _OtherProperties...> v)  // NOLINT
+      : __pointer(v.__pointer) {}
+
+  /*!
+   * \brief Exposes the interface of the underlying memory resource.
+   *
+   * \note This method should not be used to obtain the pointer to the memory resource.
+   */
+  _ResourcePointer operator->() const noexcept {
+    return __pointer;
+  }
+
+  template
+  bool operator==(
+      const cuda_for_dali::basic_resource_view<_Ptr2, _Props2...> &__v2) const noexcept {
+    using __view1_t = basic_resource_view;
+    using __view2_t = basic_resource_view<_Ptr2, _Props2...>;
+    if (__pointer == nullptr || __v2.__pointer == nullptr)
+      return __pointer == nullptr && __v2.__pointer == nullptr;
+    return static_cast(__pointer)->is_equal(*__v2.__pointer);
+  }
+
+  template
+  bool operator!=(
+      const cuda_for_dali::basic_resource_view<_Ptr2, _Props2...> &__v2) const noexcept {
+    return !(*this == __v2);
+  }
+
+  /*!
+   * \brief Returns true if the underlying pointer is not null.
+ */ + constexpr explicit operator bool() const noexcept { + return !!__pointer; + } + + private: + template + friend class basic_resource_view; + + _ResourcePointer __pointer{}; +}; + +template +basic_resource_view<_ResourcePointer, _FirstProperty, _Properties...> view_resource( + _ResourcePointer __rsrc_ptr) { + return __rsrc_ptr; +} + + +template +basic_resource_view<_ResourcePointer, + is_kind::memory_kind>> +view_resource(_ResourcePointer __rsrc_ptr) { + return __rsrc_ptr; +} + + +template +bool operator==(const basic_resource_view<_ResourcePointer, _Properties...> &__view, + const memory_resource<_Kind> *__mr) { + return __view == view_resource(__mr); +} + +template +bool operator!=(const basic_resource_view<_ResourcePointer, _Properties...> &__view, + const memory_resource<_Kind> *__mr) { + return __view != view_resource(__mr); +} + +template +bool operator==(const memory_resource<_Kind> *__mr, + const basic_resource_view<_ResourcePointer, _Properties...> &__view) { + return view_resource(__mr) == __view; +} + +template +bool operator!=(const memory_resource<_Kind> *__mr, + const basic_resource_view<_ResourcePointer, _Properties...> &__view) { + return view_resource(__mr) != __view; +} + +template +using resource_view = basic_resource_view; + +template +using stream_ordered_resource_view = + basic_resource_view; + +#if _DALI_STD_VER > 14 + +#if defined(_DALI_STD_PMR_NS) + +namespace detail { +class __pmr_adaptor_base : public _DALI_STD_PMR_NS::memory_resource { + public: + virtual cuda_for_dali::memory_resource *resource() + const noexcept = 0; +}; +} // namespace detail + +template +class pmr_adaptor final : public detail::__pmr_adaptor_base { + using resource_type = std::remove_reference_t())>; + + static constexpr bool __is_host_accessible_resource = + has_property::value; + static_assert( + __is_host_accessible_resource, + "Pointer must be a pointer-like type to a type that allocates host-accessible memory."); + + public: + pmr_adaptor(_Pointer __mr) : 
__mr_{std::move(__mr)} {} // NOLINT(runtime/explicit) + + using raw_pointer = std::remove_reference_t())>; + + raw_pointer resource() const noexcept override { + return &*__mr_; + } + + private: + void *do_allocate(std::size_t __bytes, std::size_t __alignment) override { + return __mr_->allocate(__bytes, __alignment); + } + + void do_deallocate(void *__p, std::size_t __bytes, std::size_t __alignment) override { + return __mr_->deallocate(__p, __bytes, __alignment); + } + + bool do_is_equal(_DALI_STD_PMR_NS::memory_resource const &__other) const noexcept override { + auto __other_p = dynamic_cast(&__other); + return __other_p && + (__other_p->resource() == resource() || __other_p->resource()->is_equal(*resource())); + } + + _Pointer __mr_; +}; +#endif // defined(_DALI_STD_PMR_NS) +#endif // _DALI_STD_VER > 14 + +} // namespace cuda_for_dali +} // namespace dali + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#endif // DALI_CORE_MM_CUDA_MEMORY_RESOURCE_H_ diff --git a/include/dali/core/mm/cuda_stream_view.h b/include/dali/core/mm/cuda_stream_view.h new file mode 100644 index 0000000000..4489a221ba --- /dev/null +++ b/include/dali/core/mm/cuda_stream_view.h @@ -0,0 +1,130 @@ +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DALI_CORE_MM_CUDA_STREAM_VIEW_H_
+#define DALI_CORE_MM_CUDA_STREAM_VIEW_H_
+
+#include
+// #include
+
+#include   // NOLINT(build/include_order)
+#include
+#include "dali/core/cuda_error.h"
+
+namespace dali {
+namespace cuda_for_dali {
+
+/**
+ * \brief A non-owning wrapper for a `cudaStream_t`.
+ *
+ * `stream_view` is a non-owning "view" type similar to `std::span` or `std::string_view`.
+ * \see https://en.cppreference.com/w/cpp/container/span and
+ * \see https://en.cppreference.com/w/cpp/string/basic_string_view
+ *
+ */
+class stream_view {
+ public:
+  using value_type = ::cudaStream_t;
+
+  /**
+   * \brief Constructs a `stream_view` of the "default" CUDA stream.
+   *
+   * For behavior of the default stream,
+   * \see https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
+   *
+   */
+  constexpr stream_view() noexcept = default;
+
+  /**
+   * \brief Constructs a `stream_view` from a `cudaStream_t` handle.
+   *
+   * This constructor provides implicit conversion from `cudaStream_t`.
+   *
+   * \note: It is the caller's responsibility to ensure the `stream_view` does not
+   * outlive the stream identified by the `cudaStream_t` handle.
+   *
+   */
+  constexpr stream_view(value_type stream) : __stream{stream} {}  // NOLINT(runtime/explicit)
+
+  /// Disallow construction from an `int`, e.g., `0`.
+  stream_view(int) = delete;  // NOLINT(runtime/explicit)
+
+  /// Disallow construction from `nullptr`.
+  stream_view(std::nullptr_t) = delete;  // NOLINT(runtime/explicit)
+
+  /// Returns the wrapped `cudaStream_t` handle.
+  constexpr value_type get() const noexcept {
+    return __stream;
+  }
+
+  /**
+   * \brief Synchronizes the wrapped stream.
+   *
+   * \throws cuda_for_dali::cuda_error if synchronization fails.
+ * + */ + void wait() const { + CUDA_CALL(::cudaStreamSynchronize(get())); // "Failed to synchronize stream." + } + + /** + * \brief Queries if all operations on the wrapped stream have completed. + * + * \throws cuda_for_dali::cuda_error if the query fails. + * + * \return `true` if all operations have completed, or `false` if not. + */ + bool ready() const { + auto const __result = ::cudaStreamQuery(get()); + if (__result == ::cudaSuccess) { + return true; + } else if (__result == ::cudaErrorNotReady) { + return false; + } + CUDA_CALL(__result); + return false; + } + + private: + value_type __stream{0}; ///< Handle of the viewed stream +}; + +/** + * \brief Compares two `stream_view`s for equality + * + * \note Allows comparison with `cudaStream_t` due to implicit conversion to + * `stream_view`. + * + * \param lhs The first `stream_view` to compare + * \param rhs The second `stream_view` to compare + * \return true if equal, false if unequal + */ +inline constexpr bool operator==(stream_view __lhs, stream_view __rhs) { + return __lhs.get() == __rhs.get(); +} + +/** + * \brief Compares two `stream_view`s for inequality + * + * \note Allows comparison with `cudaStream_t` due to implicit conversion to + * `stream_view`. + * + * \param lhs The first `stream_view` to compare + * \param rhs The second `stream_view` to compare + * \return true if unequal, false if equal + */ +inline constexpr bool operator!=(stream_view __lhs, stream_view __rhs) { + return !(__lhs == __rhs); +} + +} // namespace cuda_for_dali +} // namespace dali + +#endif // DALI_CORE_MM_CUDA_STREAM_VIEW_H_ diff --git a/include/dali/core/mm/memory_resource.h b/include/dali/core/mm/memory_resource.h index 5735fb7ede..60c66a8c4d 100644 --- a/include/dali/core/mm/memory_resource.h +++ b/include/dali/core/mm/memory_resource.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -#include +#include "dali/core/mm/cuda_memory_resource.h" namespace dali { @@ -36,18 +36,18 @@ namespace dali { */ namespace mm { -namespace memory_kind = cuda::memory_kind; +namespace memory_kind = cuda_for_dali::memory_kind; -using cuda::memory_resource; -using cuda::resource_view; -using cuda::stream_ordered_resource_view; +using cuda_for_dali::memory_resource; +using cuda_for_dali::resource_view; +using cuda_for_dali::stream_ordered_resource_view; using host_memory_resource = memory_resource; using pinned_memory_resource = memory_resource; -using cuda::stream_view; +using cuda_for_dali::stream_view; template -using async_memory_resource = cuda::stream_ordered_memory_resource; +using async_memory_resource = cuda_for_dali::stream_ordered_memory_resource; using device_async_resource = async_memory_resource; using pinned_async_resource = async_memory_resource; @@ -60,7 +60,8 @@ struct stream_context { namespace detail { template -constexpr bool is_host_accessible = cuda::kind_has_property::value; +constexpr bool is_host_accessible = + cuda_for_dali::kind_has_property::value; } // namespace detail diff --git a/third_party/README.rst b/third_party/README.rst index 5b890b836a..a95851ed2e 100644 --- a/third_party/README.rst +++ b/third_party/README.rst @@ -19,8 +19,6 @@ This part of the repository contains extra dependencies required to build DALI, +-----------------+---------------------+---------------------+ | |googletest|_ | |googletestver|_ | |googletestlic|_ | +-----------------+---------------------+---------------------+ -| |libcudacxx|_ | |libcudacxxver|_ | |libcudacxxlic|_ | -+-----------------+---------------------+---------------------+ | |pybind11|_ | |pybind11ver|_ | |pybind11lic|_ | +-----------------+---------------------+---------------------+ | |rapidjson|_ | |rapidjsonver|_ | 
|rapidjsonlic|_ | @@ -75,13 +73,6 @@ This part of the repository contains extra dependencies required to build DALI, .. |googletestlic| replace:: BSD 3-Clause License .. _googletestlic: https://github.com/google/googletest/blob/master/LICENSE -.. |libcudacxx| replace:: libcu++ -.. _libcudacxx: https://github.com/mzient/libcudacxx.git -.. |libcudacxxver| replace:: Custom fork (Aug 30, 2021) -.. _libcudacxxver: https://github.com/mzient/libcudacxx/tree/863f11a16cced8b7aacfc639dacb419843a300e8 -.. |libcudacxxlic| replace:: Apache License v2.0 with LLVM Exceptions -.. _libcudacxxlic: https://github.com/mzient/libcudacxx/blob/main/LICENSE.TXT - .. |pybind11| replace:: pybind11 .. _pybind11: https://github.com/pybind/pybind11 .. |pybind11ver| replace:: 2.10.4 diff --git a/third_party/libcudacxx b/third_party/libcudacxx deleted file mode 160000 index 00f1bf1fcc..0000000000 --- a/third_party/libcudacxx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 00f1bf1fcc10bb452bb5e1e94d41b74ed666e235