From 32a02275a987045ed61b17252e8e647787639538 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:45:22 -0700 Subject: [PATCH 01/71] Delete header --- libcudacxx/include/cuda/std/atomic | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 libcudacxx/include/cuda/std/atomic diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic deleted file mode 100644 index 0daab5f2cb..0000000000 --- a/libcudacxx/include/cuda/std/atomic +++ /dev/null @@ -1,22 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA_STD_ATOMIC -#define _CUDA_STD_ATOMIC - -#include - -#include - -#include - -#include - -#endif // _CUDA_STD_ATOMIC From 4be887d1898fc660a41fdf7e4e02bc4d9c6679a9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:46:32 -0700 Subject: [PATCH 02/71] Move atomic from libcxx to top-level --- libcudacxx/include/cuda/std/{detail/libcxx/include => }/atomic | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include => }/atomic (100%) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/atomic similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/atomic rename to libcudacxx/include/cuda/std/atomic From b36fec61334a75e4e72db0ac6e8f0053e3209b13 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:50:52 -0700 Subject: [PATCH 03/71] Move PTX backends from libcxx to --- libcudacxx/codegen/CMakeLists.txt | 10 +++++----- libcudacxx/codegen/codegen.cpp | 4 ++-- .../atomic_cuda_ptx_derived.h} | 0 .../atomic_cuda_ptx_generated.h} | 0 4 files changed, 7 insertions(+), 7 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_cuda_derived.h => __atomic/atomic_cuda_ptx_derived.h} (100%) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_cuda_generated.h => __atomic/atomic_cuda_ptx_generated.h} (100%) diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index b0df1b5a98..af1b6bdb8a 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -19,8 +19,8 @@ target_compile_features( add_dependencies(libcudacxx.atomics.codegen codegen) -set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_generated.h") -set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/detail/libcxx/include/support/atomic") +set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_ptx_generated.h") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic") add_custom_target( libcudacxx.atomics.codegen.execute @@ -32,13 +32,13 @@ add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute) add_custom_target( libcudacxx.atomics.codegen.install - COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_generated.h" - BYPRODUCTS "${atomic_install_location}/atomic_cuda_generated.h" + COMMAND 
${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_ptx_generated.h" + BYPRODUCTS "${atomic_install_location}/atomic_cuda_ptx_generated.h" ) add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute) add_test( NAME libcudacxx.atomics.codegen.diff - COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_generated.h" "${atomic_generated_output}" + COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_ptx_generated.h" "${atomic_generated_output}" ) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index 77d96a92d9..fd032d1d4b 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -66,9 +66,9 @@ int main() std::vector cv_qualifier{"volatile ", ""}; - std::ofstream out("atomic_cuda_generated.h"); + std::ofstream out("atomic_cuda_ptx_generated.h"); - out << R"XXX(//===----------------------------------------------------------------------===// + out << R"XXX(//===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h b/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h rename to libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h rename to libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h From 52a60bb7f73040dc7b6ff38ad4857c1a4e6f3ce8 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:53:09 -0700 Subject: [PATCH 04/71] Delete remaining atomics backends. 
Move MSVC backend --- .../atomic_gcc.h => __atomic/platform.h} | 9 +- .../platform/msvc_to_builtins.h} | 2 - .../include/support/atomic/atomic_base.h | 246 ------ .../include/support/atomic/atomic_c11.h | 241 ------ .../include/support/atomic/atomic_cuda.h | 787 ------------------ .../include/support/atomic/atomic_nvrtc.h | 17 - .../include/support/atomic/atomic_scopes.h | 67 -- .../include/support/atomic/cxx_atomic.h | 180 ---- 8 files changed, 3 insertions(+), 1546 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_gcc.h => __atomic/platform.h} (74%) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_msvc.h => __atomic/platform/msvc_to_builtins.h} (99%) delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h b/libcudacxx/include/cuda/std/__atomic/platform.h similarity index 74% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h rename to libcudacxx/include/cuda/std/__atomic/platform.h index 8d5d7967cb..9a2f683d15 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcudacxx/include/cuda/std/__atomic/platform.h @@ -9,9 +9,6 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX_ATOMIC_GCC_H -#define _LIBCUDACXX_ATOMIC_GCC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_GCC_H +#if defined(_CCCL_COMPILER_MSVC) +#include +#endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h similarity index 99% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h rename to libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 53cd9cd4d7..d48c68acb4 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -621,5 +621,3 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) } return __expected; } - -#include diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h deleted file mode 100644 index 65be5cfd97..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h +++ /dev/null @@ -1,246 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_BASE_H -#define _LIBCUDACXX_ATOMIC_BASE_H - -#include - -// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELEASE - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_failure_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELAXED - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); -} - -template -inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - __cxx_atomic_assign_volatile(*__a_tmp, __val); -} - -template -inline void __cxx_atomic_init(_Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - *__a_tmp = __val; -} - -inline void __cxx_atomic_thread_fence(memory_order __order) -{ - __atomic_thread_fence(__cxx_atomic_order_to_int(__order)); -} - -inline void __cxx_atomic_signal_fence(memory_order __order) -{ - __atomic_signal_fence(__cxx_atomic_order_to_int(__order)); -} - -template -inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - __atomic_store(__cxx_atomic_unwrap(__a), &__v_temp, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_load(__cxx_atomic_unwrap(__a), &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline auto __cxx_atomic_exchange(_Tp* __a, _Up __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_exchange(__cxx_atomic_unwrap(__a), &__v_temp, &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - false, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -inline bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - 
__cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - true, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -struct __atomic_ptr_inc -{ - enum - { - value = 1 - }; -}; - -template -struct __atomic_ptr_inc<_Tp*> -{ - enum - { - value = sizeof(_Tp) - }; -}; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. -template -struct __atomic_ptr_inc<_Tp[]> -{}; -template -struct __atomic_ptr_inc<_Tp[n]> -{}; - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected + __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected + __delta; - } - - return __expected; -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected - __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected - __delta; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_and(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_or(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_xor(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected > __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected > __val ? 
__expected : __val; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected < __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected < __val ? __expected : __val; - } - - return __expected; -} - -#endif // _LIBCUDACXX_ATOMIC_BASE_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h deleted file mode 100644 index 1e5c55d243..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// Atomics for C11 - -template -struct __cxx_atomic_base_impl -{ - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_base_impl() noexcept = default; - - constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - _LIBCUDACXX_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; -}; - -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __c11_atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_thread_fence(memory_order __order) noexcept -{ - __c11_atomic_thread_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_signal_fence(memory_order __order) noexcept -{ - __c11_atomic_signal_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const volatile* __a, memory_order __order) noexcept -{ - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const* __a, memory_order __order) noexcept -{ - using 
__ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp 
-__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h deleted file mode 100644 index b6fa9a16fd..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h +++ /dev/null @@ -1,787 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -#if defined(__CUDA_MINIMUM_ARCH__) \ - && ((!defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 600) \ - || (defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 700)) -# error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows." -#endif - -inline _CCCL_HOST_DEVICE int __stronger_order_cuda(int __a, int __b) -{ - int const __max = __a > __b ? __a : __b; - if (__max != __ATOMIC_RELEASE) - { - return __max; - } - static int const __xform[] = {__ATOMIC_RELEASE, __ATOMIC_ACQ_REL, __ATOMIC_ACQ_REL, __ATOMIC_RELEASE}; - return __xform[__a < __b ? __a : __b]; -} - -// pre-define lock free query for heterogeneous compatibility -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) (__x <= 8) -#endif - -// Wrap host atomic implementations into a sub-namespace -namespace __host -{ -#if defined(_CCCL_COMPILER_MSVC) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C11_ATOMIC_IMP) -// TODO -// # include -#elif defined(_CCCL_COMPILER_NVRTC) -# include -#endif -} // namespace __host - -using __host::__cxx_atomic_underlying_t; - -#include -#include - -_CCCL_HOST_DEVICE inline void __cxx_atomic_thread_fence(memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), - NV_IS_HOST, - (__host::__cxx_atomic_thread_fence(__order);)) -} - -_CCCL_HOST_DEVICE inline void __cxx_atomic_signal_fence(memory_order __order) -{ - NV_DISPATCH_TARGET(NV_IS_DEVICE, - (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), - NV_IS_HOST, - (__host::__cxx_atomic_signal_fence(__order);)) -} - -template -struct __cxx_atomic_base_heterogeneous_impl -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_base_impl<_Tp, _Sco> __a_value; -}; - -template -struct __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true> -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - static_assert(sizeof(_Tp) >= 4, "atomic_ref does not support 1 or 2 byte types"); - static_assert(sizeof(_Tp) <= 8, "atomic_ref does not support types larger than 8 bytes"); - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp& __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_ref_base_impl<_Tp, _Sco> __a_value; -}; - -template -_CCCL_HOST_DEVICE constexpr _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr volatile _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const volatile _Tp* 
-__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -using __cxx_atomic_small_to_32 = __conditional_t::value, int32_t, uint32_t>; - -// Arithmetic conversions to/from proxy types -template ::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - return static_cast<__cxx_atomic_small_to_32<_Tp>>(__val); -} - -template ::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - return static_cast<_Tp>(__val); -} - -// Non-arithmetic conversion to/from proxy types -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - __cxx_atomic_small_to_32<_Tp> __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - _Tp __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template -struct __cxx_atomic_base_small_impl -{ - __cxx_atomic_base_small_impl() noexcept = default; - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_small_impl(_Tp __value) - : __a_value(__cxx_small_to_32(__value)) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __cxx_atomic_base_heterogeneous_impl<__cxx_atomic_small_to_32<_Tp>, _Sco, false> __a_value; -}; - -template -using __cxx_atomic_base_impl = - __conditional_t, - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco>>; - -template -using __cxx_atomic_ref_base_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - 
(return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_exchange( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - 
static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* 
__a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_and( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_or( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return 
__host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_xor( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_max( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_min( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE inline void __cxx_atomic_init(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val) -{ - __cxx_atomic_init(&__a->__a_value, __cxx_small_to_32(__val)); -} - -template -_CCCL_HOST_DEVICE inline void -__cxx_atomic_store(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) -{ - __cxx_atomic_store(&__a->__a_value, __cxx_small_to_32(__val), __order); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_load(__cxx_atomic_base_small_impl<_Tp, _Sco> const volatile* __a, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_load(&__a->__a_value, __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order)); -} -_CCCL_HOST_DEVICE inline int __cuda_memcmp(void const* __lhs, void const* __rhs, size_t __count) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (auto __lhs_c = reinterpret_cast(__lhs); - auto __rhs_c = reinterpret_cast(__rhs); - while (__count--) { - auto const __lhs_v = *__lhs_c++; - auto const __rhs_v = *__rhs_c++; - if 
(__lhs_v < __rhs_v) - { - return -1; - } - if (__lhs_v > __rhs_v) - { - return 1; - } - } return 0;), - NV_IS_HOST, - (return memcmp(__lhs, __rhs, __count);)) -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto __temp = __cxx_small_to_32(*__expected); - auto const __ret = - __cxx_atomic_compare_exchange_weak(&__a->__a_value, &__temp, __cxx_small_to_32(__value), __success, __failure); - auto const __actual = __cxx_small_from_32<_Tp>(__temp); - constexpr auto __mask = static_cast((1u << (8 * sizeof(_Tp))) - 1); - if (!__ret) - { - if (0 == __cuda_memcmp(&__actual, __expected, sizeof(_Tp))) - { - __cxx_atomic_fetch_and(&__a->__a_value, __mask, memory_order_relaxed); - } - else - { - *__expected = __actual; - } - } - return __ret; -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto const __old = *__expected; - while (1) - { - if (__cxx_atomic_compare_exchange_weak(__a, __expected, __value, __success, __failure)) - { - return true; - } - if (0 != __cuda_memcmp(&__old, __expected, sizeof(_Tp))) - { - return false; - } - } -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_max(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_min(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h deleted file mode 100644 index 
129b088081..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h +++ /dev/null @@ -1,17 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_NVRTC_H -#define _LIBCUDACXX_ATOMIC_NVRTC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_NVRTC_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h deleted file mode 100644 index 9a035b1e4d..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H -#define __LIBCUDACXX_ATOMIC_SCOPES_H - -// REMEMBER CHANGES TO THESE ARE ABI BREAKING -// TODO: Space values out for potential new scopes -#ifndef __ATOMIC_BLOCK -# define __ATOMIC_SYSTEM 0 // 0 indicates default -# define __ATOMIC_DEVICE 1 -# define __ATOMIC_BLOCK 2 -# define __ATOMIC_THREAD 10 -#endif //__ATOMIC_BLOCK - -enum thread_scope -{ - thread_scope_system = __ATOMIC_SYSTEM, - thread_scope_device = __ATOMIC_DEVICE, - thread_scope_block = __ATOMIC_BLOCK, - thread_scope_thread = __ATOMIC_THREAD -}; - -#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope -#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system - -struct __thread_scope_thread_tag -{}; -struct __thread_scope_block_tag -{}; -struct __thread_scope_device_tag -{}; -struct __thread_scope_system_tag -{}; - -template -struct __scope_enum_to_tag -{}; -/* This would be the implementation once an actual thread-scope backend exists. -template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_thread_tag; }; -Until then: */ -template <> -struct __scope_enum_to_tag<(int) thread_scope_thread> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_block> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_device> -{ - using type = __thread_scope_device_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_system> -{ - using type = __thread_scope_system_tag; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> typename __scope_enum_to_tag<_Scope>::type -{ - return typename __scope_enum_to_tag<_Scope>::type(); -} - -#endif // __LIBCUDACXX_ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h deleted file mode 100644 index a4212f44a7..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h +++ /dev/null @@ -1,180 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_CXX_ATOMIC_H -#define _LIBCUDACXX_CXX_ATOMIC_H - -template -struct __cxx_atomic_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = __cxx_atomic_base_impl<_Tp, _Sco>; - using __wrap_t = __cxx_atomic_base_impl<_Tp, _Sco>; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_base_impl() noexcept = default; - constexpr __cxx_atomic_base_impl(__cxx_atomic_base_impl&&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - - __cxx_atomic_base_impl& operator=(const __cxx_atomic_base_impl&) noexcept = default; - - _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a; -} - -template -struct __cxx_atomic_ref_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = _Tp; - using __wrap_t = _Tp; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, - "std::atomic_ref requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_ref_base_impl() noexcept = delete; - constexpr __cxx_atomic_ref_base_impl(__cxx_atomic_ref_base_impl&&) noexcept = default; - constexpr __cxx_atomic_ref_base_impl(const __cxx_atomic_ref_base_impl&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_ref_base_impl(_Tp& value) noexcept - : __a_value(&value) - {} - - _Tp* __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY 
constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(_Tp* __a) noexcept -{ - return __a; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_wrap_to_base(_Tp*, _Up __val) noexcept -> - typename _Tp::__wrap_t -{ - return typename _Tp::__wrap_t(__val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_base_temporary(_Tp*) noexcept -> typename _Tp::__temporary_t -{ - return typename _Tp::__temporary_t(); -} - -template -using __cxx_atomic_underlying_t = typename _Tp::__underlying_t; - -#endif //_LIBCUDACXX_CXX_ATOMIC_H From 76294d11026a771868d4b636b95deba7af2b4e19 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 12 Apr 2024 19:40:57 -0700 Subject: [PATCH 05/71] First pass at making atomic use new backends --- libcudacxx/codegen/CMakeLists.txt | 2 +- libcudacxx/codegen/codegen.cpp | 8 +- .../atomic_cuda_ptx_derived.h | 0 .../atomic_cuda_ptx_generated.h | 31 +- .../std/__atomic/operations/heterogeneous.h | 265 ++++++ .../cuda/std/__atomic/operations/host.h | 182 ++++ libcudacxx/include/cuda/std/__atomic/order.h | 126 +++ .../include/cuda/std/__atomic/platform.h | 14 - .../std/__atomic/platform/msvc_to_builtins.h | 12 + .../cuda/std/__atomic/platform/platform.h | 59 ++ libcudacxx/include/cuda/std/__atomic/scopes.h | 52 + .../include/cuda/std/__atomic/storage/base.h | 60 ++ .../cuda/std/__atomic/storage/common.h | 46 + .../cuda/std/__atomic/storage/locked.h | 204 ++++ .../cuda/std/__atomic/storage/reference.h | 48 + .../include/cuda/std/__atomic/storage/small.h | 177 ++++ .../cuda/std/__atomic/wait/notify_wait.h | 188 ++++ .../include/cuda/std/__atomic/wait/polling.h | 56 ++ libcudacxx/include/cuda/std/atomic | 901 +++--------------- 19 files changed, 1624 insertions(+), 807 deletions(-) rename libcudacxx/include/cuda/std/__atomic/{ => operations}/atomic_cuda_ptx_derived.h (100%) rename libcudacxx/include/cuda/std/__atomic/{ => operations}/atomic_cuda_ptx_generated.h (99%) create mode 100644 libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h create mode 100644 libcudacxx/include/cuda/std/__atomic/operations/host.h create mode 100644 libcudacxx/include/cuda/std/__atomic/order.h delete mode 100644 
libcudacxx/include/cuda/std/__atomic/platform.h create mode 100644 libcudacxx/include/cuda/std/__atomic/platform/platform.h create mode 100644 libcudacxx/include/cuda/std/__atomic/scopes.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/base.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/common.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/locked.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/reference.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/small.h create mode 100644 libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h create mode 100644 libcudacxx/include/cuda/std/__atomic/wait/polling.h diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index af1b6bdb8a..77e749b83b 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -20,7 +20,7 @@ target_compile_features( add_dependencies(libcudacxx.atomics.codegen codegen) set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_ptx_generated.h") -set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/operations") add_custom_target( libcudacxx.atomics.codegen.execute diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index fd032d1d4b..c1f809bd4b 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -78,8 +78,11 @@ int main() // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate +// This is a autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + )XXX"; auto scopenametag = [&](auto scope) { @@ -306,7 +309,7 @@ int main() "__failure_memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n"; - out << " memcpy(&__tmp, __desired, " << sz / 8 << ");\n"; + out << " memcpy(&__tmp, &__desired, " << sz / 8 << ");\n"; out << " memcpy(&__old, __expected, " << sz / 8 << ");\n"; out << " __old_tmp = __old;\n"; out << " NV_DISPATCH_TARGET(\n"; @@ -503,6 +506,7 @@ int main() } } + out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n"; out << "\n// clang-format on\n"; return 0; diff --git a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_derived.h similarity index 100% rename from libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h rename to libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_derived.h diff --git a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h similarity index 99% rename from libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h rename to libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h index 648de27352..ff1bdcf1ff 100644 --- a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h @@ -8,8 +8,11 @@ // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate +// This is a 
autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + static inline _CCCL_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } @@ -251,7 +254,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -285,7 +288,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1158,7 +1161,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1192,7 +1195,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2428,7 +2431,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2462,7 +2465,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3335,7 +3338,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, 
&__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3369,7 +3372,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4605,7 +4608,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4639,7 +4642,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5512,7 +5515,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5546,7 +5549,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -6542,4 +6545,6 @@ _CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int return __ret; } +_LIBCUDACXX_END_NAMESPACE_STD + // clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h new file mode 100644 index 0000000000..86a142de08 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -0,0 +1,265 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_DISPATCH_H +#define __LIBCUDACXX___ATOMIC_DISPATCH_H + +#include + +#include +#include +#include + +#include +#include +#include + +// Dispatch directly calls PTX/Host backends for atomic objects. +// By default these objects support extracting the address contained with operator()() +// this provides some amount of syntactic sugar to avoid duplicating every function that requires `volatile`. +// `_Tp` is able to be volatile and will simply be instatiated into a new function. +// It is up to the underlying backends to implement the correct volatile behavior + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +_LIBCUDACXX_HOST_DEVICE +inline + void __atomic_thread_fence_dispatch(memory_order __order) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag()); + ), + NV_IS_HOST, ( + __atomic_thread_fence_host(__order); + ) + ) +} + +_LIBCUDACXX_HOST_DEVICE +inline + void __atomic_signal_fence_dispatch(memory_order __order) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order)); + ), + NV_IS_HOST, ( + __atomic_signal_fence_host(__order); + ) + ) +} + +// Regarding __atomic_base_Tag +// It *is* possible to define it as: +// _Tag = __atomic_enable_if_default_base_t<_Tp> and make all tag types default to the 'base' backend +// I don't know if it's necessary to do that though. For now, this just adds some kind of protection +// preventing access to the functions with the wrong tag type. +template +using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; + +template > +_LIBCUDACXX_HOST_DEVICE + void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { + __atomic_assign_volatile(__a(), __val); +} + +template > +_LIBCUDACXX_HOST_DEVICE + void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + alignas(_Tp) auto __tmp = __val; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_store_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + __atomic_store_host(__a(), __tmp, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_load_n_cuda(__a(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_load_host(__a(), __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco = {}, _Tag = {}) { + alignas(_Tp) auto __tmp = __value; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_exchange_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_exchange_host(__a(), __tmp, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = 
__atomic_compare_exchange_cuda(__a(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + ), + NV_IS_HOST, ( + __result = __atomic_compare_exchange_strong_host(__a(), __expected, __val, __success, __failure); + ) + ) + return __result; +} + +template > +_LIBCUDACXX_HOST_DEVICE + bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + ), + NV_IS_HOST, ( + __result = __atomic_compare_exchange_weak_host(__a(), __expected, __val, __success, __failure); + ) + ) + return __result; +} + +template +using __atomic_enable_if_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; +template +using __atomic_enable_if_not_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_add_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_add_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_sub_cuda(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_sub_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_and_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_and_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return 
__atomic_fetch_or_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_or_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_xor_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_xor_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + NV_IF_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_max_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), ( + return __atomic_fetch_max_host(__a(), __val, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + NV_IF_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_min_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), ( + return __atomic_fetch_min_host(__a(), __val, __order); + ) + ) +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_DISPATCH_H diff --git a/libcudacxx/include/cuda/std/__atomic/operations/host.h b/libcudacxx/include/cuda/std/__atomic/operations/host.h new file mode 100644 index 0000000000..4870c011c4 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/operations/host.h @@ -0,0 +1,182 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMICS_HOST_H +#define _LIBCUDACXX___ATOMICS_HOST_H + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) +#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE +#define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __atomic_is_lock_free(__x, 0) +#endif + +inline +void __atomic_thread_fence_host(memory_order __order) { + __atomic_thread_fence(__atomic_order_to_int(__order)); +} + +inline +void __atomic_signal_fence_host(memory_order __order) { + __atomic_signal_fence(__atomic_order_to_int(__order)); +} + +template +inline void __atomic_store_host(_Tp* __a, _Up __val, memory_order __order) { + __atomic_store(__a, &__val, __atomic_order_to_int(__order)); +} + +template +inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> _Tp { + __remove_cvref_t<_Tp> __ret{}; + __atomic_load(__a, &__ret, __atomic_order_to_int(__order)); + return __ret; +} + +template +inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> _Tp { + __remove_cvref_t<_Tp> __ret{}; + __atomic_exchange(__a, &__val, &__ret, __atomic_order_to_int(__order)); + return __ret; +} + +template +inline bool __atomic_compare_exchange_strong_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + (void)__expected; + return __atomic_compare_exchange(__a, + __expected, &__value, false, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +inline bool __atomic_compare_exchange_weak_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + (void)__expected; + return __atomic_compare_exchange(__a, + __expected, &__value, true, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +struct __atomic_ptr_inc { enum {value = 1}; }; + +template +struct __atomic_ptr_inc<_Tp*> { enum {value = sizeof(_Tp)}; }; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. 
+template +struct __atomic_ptr_inc<_Tp[]> { }; +template +struct __atomic_ptr_inc<_Tp[n]> { }; + +template ::value, int> = 0> +inline _Tp __atomic_fetch_add_host(_Tp* __a, _Td __delta, + memory_order __order) { + constexpr auto __skip_v = __atomic_ptr_inc<_Tp>::value; + return __atomic_fetch_add(__a, __delta * __skip_v, + __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_add_host(_Tp* __a, _Td __delta, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected + __delta; + + while(!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected + __delta; + } + + return __expected; +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_sub_host(_Tp* __a, _Td __delta, + memory_order __order) { + constexpr auto __skip_v = __atomic_ptr_inc<_Tp>::value; + return __atomic_fetch_sub(__a, __delta * __skip_v, + __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_sub_host(_Tp* __a, _Td __delta, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected - __delta; + + while(!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected - __delta; + } + + return __expected; +} + +template +inline _Tp __atomic_fetch_and_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_and(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_or_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_or(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_xor_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_xor(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_max_host(_Tp* __a, _Td __val, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected > __val ? __expected : __val; + + while(__desired == __val && + !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected > __val ? __expected : __val; + } + + return __expected; +} + +template +inline _Tp __atomic_fetch_min_host(_Tp* __a, _Td __val, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected < __val ? __expected : __val; + + while(__desired == __val && + !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected < __val ? __expected : __val; + } + + return __expected; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMICS_HOST_H diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h new file mode 100644 index 0000000000..d5c37c45ec --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -0,0 +1,126 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX_ATOMIC_ORDER_H +#define __LIBCUDACXX_ATOMIC_ORDER_H + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ + __m == memory_order_acquire || \ + __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ + __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ + __f == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#ifndef __ATOMIC_RELAXED +#define __ATOMIC_RELAXED 0 +#define __ATOMIC_CONSUME 1 +#define __ATOMIC_ACQUIRE 2 +#define __ATOMIC_RELEASE 3 +#define __ATOMIC_ACQ_REL 4 +#define __ATOMIC_SEQ_CST 5 +#endif //__ATOMIC_RELAXED + +// Figure out what the underlying type for `memory_order` would be if it were +// declared as an unscoped enum (accounting for -fshort-enums). Use this result +// to pin the underlying type in C++20. +enum __legacy_memory_order { + __mo_relaxed, + __mo_consume, + __mo_acquire, + __mo_release, + __mo_acq_rel, + __mo_seq_cst +}; + +typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; + +#if _CCCL_STD_VER > 2017 + +enum class memory_order : __memory_order_underlying_t { + relaxed = __mo_relaxed, + consume = __mo_consume, + acquire = __mo_acquire, + release = __mo_release, + acq_rel = __mo_acq_rel, + seq_cst = __mo_seq_cst +}; + +inline constexpr auto memory_order_relaxed = memory_order::relaxed; +inline constexpr auto memory_order_consume = memory_order::consume; +inline constexpr auto memory_order_acquire = memory_order::acquire; +inline constexpr auto memory_order_release = memory_order::release; +inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; +inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; + +#else + +typedef enum memory_order { + memory_order_relaxed = __mo_relaxed, + memory_order_consume = __mo_consume, + memory_order_acquire = __mo_acquire, + memory_order_release = __mo_release, + memory_order_acq_rel = __mo_acq_rel, + memory_order_seq_cst = __mo_seq_cst, +} memory_order; + +#endif // _CCCL_STD_VER > 2017 + +_LIBCUDACXX_HOST_DEVICE +inline int __stronger_order_cuda(int __a, int __b) { + int const __max = __a > __b ? __a : __b; + if(__max != __ATOMIC_RELEASE) + return __max; + static int const __xform[] = { + __ATOMIC_RELEASE, + __ATOMIC_ACQ_REL, + __ATOMIC_ACQ_REL, + __ATOMIC_RELEASE }; + return __xform[__a < __b ? __a : __b]; +} + +_LIBCUDACXX_HOST_DEVICE +inline constexpr int __atomic_order_to_int(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELEASE: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: + __ATOMIC_CONSUME)))); +} + +_LIBCUDACXX_HOST_DEVICE +inline constexpr int __atomic_failure_order_to_int(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? 
__ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELAXED: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: + __ATOMIC_CONSUME)))); +} + +static_assert((is_same::type, __memory_order_underlying_t>::value), + "unexpected underlying type for std::memory_order"); + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX_ATOMIC_ORDER_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform.h b/libcudacxx/include/cuda/std/__atomic/platform.h deleted file mode 100644 index 9a2f683d15..0000000000 --- a/libcudacxx/include/cuda/std/__atomic/platform.h +++ /dev/null @@ -1,14 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#if defined(_CCCL_COMPILER_MSVC) -#include -#endif diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index d48c68acb4..f1ddff6dfd 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -9,10 +9,18 @@ // //===----------------------------------------------------------------------===// +#ifndef __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H +#define __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H + #ifndef _MSC_VER # error "This file is only for CL.EXE's benefit" #endif +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + #define _LIBCUDACXX_COMPILER_BARRIER() _ReadWriteBarrier() #if defined(_M_ARM) || defined(_M_ARM64) @@ -621,3 +629,7 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) } return __expected; } + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform/platform.h b/libcudacxx/include/cuda/std/__atomic/platform/platform.h new file mode 100644 index 0000000000..cabb9de827 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/platform/platform.h @@ -0,0 +1,59 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#if defined(_CCCL_COMPILER_MSVC) +#include +#endif + +#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE +#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +#endif + +#if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +#define ATOMIC_BOOL_LOCK_FREE 2 +#define ATOMIC_CHAR_LOCK_FREE 2 +#define ATOMIC_CHAR16_T_LOCK_FREE 2 +#define ATOMIC_CHAR32_T_LOCK_FREE 2 +#define ATOMIC_WCHAR_T_LOCK_FREE 2 +#define ATOMIC_SHORT_LOCK_FREE 2 +#define ATOMIC_INT_LOCK_FREE 2 +#define ATOMIC_LONG_LOCK_FREE 2 +#define ATOMIC_LLONG_LOCK_FREE 2 +#define ATOMIC_POINTER_LOCK_FREE 2 +#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +template struct __atomic_is_always_lock_free { + enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; +#else +template struct __atomic_is_always_lock_free { + enum { __value = sizeof(_Tp) <= 8 }; }; +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) diff --git a/libcudacxx/include/cuda/std/__atomic/scopes.h b/libcudacxx/include/cuda/std/__atomic/scopes.h new file mode 100644 index 0000000000..3208227dc8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/scopes.h @@ -0,0 +1,52 @@ +#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H +#define __LIBCUDACXX_ATOMIC_SCOPES_H + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// REMEMBER CHANGES TO THESE ARE ABI BREAKING +// TODO: Space values out for potential new scopes +#ifndef __ATOMIC_BLOCK +#define __ATOMIC_SYSTEM 0 // 0 indicates default +#define __ATOMIC_DEVICE 1 +#define __ATOMIC_BLOCK 2 +#define __ATOMIC_THREAD 10 +#endif //__ATOMIC_BLOCK + +enum thread_scope { + thread_scope_system = __ATOMIC_SYSTEM, + thread_scope_device = __ATOMIC_DEVICE, + thread_scope_block = __ATOMIC_BLOCK, + thread_scope_thread = __ATOMIC_THREAD +}; + +#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope +#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system + +struct __thread_scope_thread_tag { }; +struct __thread_scope_block_tag { }; +struct __thread_scope_device_tag { }; +struct __thread_scope_system_tag { }; + +template struct __scope_enum_to_tag { }; +/* This would be the 
implementation once an actual thread-scope backend exists. +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_thread_tag; }; +Until then: */ +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_block> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_device> { + using type = __thread_scope_device_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_system> { + using type = __thread_scope_system_tag; }; + +template +_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> + typename __scope_enum_to_tag<_Scope>::type { + return typename __scope_enum_to_tag<_Scope>::type(); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX_ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h new file mode 100644 index 0000000000..ef197fd4ef --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -0,0 +1,60 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_BASE_H +#define _LIBCUDACXX___ATOMIC_STORAGE_BASE_H + +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct __atomic_base_tag {}; + +template +struct __atomic_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(is_trivially_copyable<_Tp>::value, + "std::atomic requires that 'Tp' be a trivially copyable type"); +#endif + + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + _LIBCUDACXX_HOST_DEVICE + __atomic_storage() noexcept + : __a_value() {} + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_storage(_Tp value) noexcept + : __a_value(value) {} + + _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() volatile -> volatile __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() const -> const __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() const volatile -> const volatile __underlying_t* { + return &__a_value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_BASE_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/common.h b/libcudacxx/include/cuda/std/__atomic/storage/common.h new file mode 100644 index 0000000000..22f946aada --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/common.h @@ -0,0 +1,46 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H +#define _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because +// the default operator= in an object is not volatile, a byte-by-byte copy +// is required. +template +__enable_if_t::value> +_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { + __a_value = __val; +} + +template +__enable_if_t::value> +_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { + volatile char* __to = reinterpret_cast(&__a_value); + volatile char* __end = __to + sizeof(_Tp); + volatile const char* __from = reinterpret_cast(&__val); + while (__to != __end) + *__to++ = *__from++; +} + +template +using __atomic_underlying_t = typename __remove_cvref_t<_Tp>::__underlying_t; + +template +using __atomic_tag_t = typename __remove_cvref_t<_Tp>::__tag_t; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/locked.h b/libcudacxx/include/cuda/std/__atomic/storage/locked.h new file mode 100644 index 0000000000..ab359bc780 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/locked.h @@ -0,0 +1,204 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H +#define _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H + +#include + +#include + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Locked atomics must override the dispatch to be able to implement RMW primitives around the embedded lock. 
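The comment above is the key idea behind this new backend: every read-modify-write on a locked storage is an ordinary load/store bracketed by acquiring and releasing the embedded flag. As a purely illustrative aside (not part of the patch), the same pattern can be sketched standalone with plain std::atomic<bool> instead of the dispatch helpers; `locked_box` and its members are hypothetical names used only for this sketch.

#include <atomic>

// Sketch of the lock-around-RMW pattern implemented by the locked backend.
template <class T>
struct locked_box {
  T value{};
  mutable std::atomic<bool> locked{false};

  T fetch_add(T delta) {
    // Take the lock: exchange with acquire ordering, spin while it was held.
    while (locked.exchange(true, std::memory_order_acquire)) { /* spin */ }
    T old = value;        // plain load, protected by the lock
    value = old + delta;  // plain modify + store, still under the lock
    // Drop the lock: store with release ordering.
    locked.store(false, std::memory_order_release);
    return old;           // RMW returns the previous value, as the backend does
  }
};

Exchange-with-acquire to take the lock and store-with-release to drop it is the same ordering used by the __lock/__unlock members that follow.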
+struct __atomic_locked_tag {}; + +template +struct __atomic_locked_storage { + using __underlying_t = typename remove_cv<_Tp>::type; + using __tag_t = typename __atomic_locked_tag; + + _LIBCUDACXX_HOST_DEVICE + __atomic_locked_storage() noexcept + : __a_value(), __a_lock(0) {} + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_locked_storage(_Tp value) noexcept + : __a_value(value), __a_lock(0) {} + + _Tp __a_value; + mutable __atomic_storage<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a_lock; + + template + _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const volatile { + while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const { + while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const volatile { + __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } + template + _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const { + __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_locked_tag) { + __atomic_assign_volatile(__a.__a_value, __val); +} + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order, _Sco, __atomic_locked_tag) { + __a.__lock(_Sco{}); + __atomic_assign_volatile(__a.__a_value, __val); + __a.__unlock(_Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __value); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __temp; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__temp, __a.__a_value); + bool __ret = __temp == *__expected; + if(__ret) + __atomic_assign_volatile(__a.__a_value, __value); + else + __atomic_assign_volatile(*__expected, __a.__a_value); + __a.__unlock(_Sco{}); + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE +bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __temp; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__temp, __a.__a_value); + bool __ret = __temp == *__expected; + if(__ret) + __atomic_assign_volatile(__a.__a_value, __value); + else + __atomic_assign_volatile(*__expected, __a.__a_value); + __a.__unlock(_Sco{}); + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> 
__atomic_fetch_add_dispatch(_Tp& __a, + _Td __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old + __delta)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, + ptrdiff_t __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __old + __delta); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old - __delta)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old & __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old | __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, _Tp(__old ^ __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h new file mode 100644 index 0000000000..3ead98703a --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -0,0 +1,48 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_REF_H +#define _LIBCUDACXX___ATOMIC_STORAGE_REF_H + +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Reference is compatible with __atomic_base_tag and uses default dispatch + +template +struct __atomic_ref_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(is_trivially_copyable<_Tp>::value, + "std::atomic_ref requires that 'Tp' be a trivially copyable type"); +#endif + + _Tp* __a_value; + + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_ref_storage(_Tp& value) noexcept + : __a_value(&value) {} + + _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + return __a_value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_REF_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/small.h b/libcudacxx/include/cuda/std/__atomic/storage/small.h new file mode 100644 index 0000000000..679fbd5487 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/small.h @@ -0,0 +1,177 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H +#define _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H + +#include + +#include + +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Atomic small types require conversion to/from a proxy type that can be +// manipulated by PTX without any performance overhead +struct __atomic_small_tag {}; + +template +using __atomic_small_proxy_t = __conditional_t::value, int32_t, uint32_t>; + +// Arithmetic conversions to/from proxy types +template::value, int> = 0> +constexpr _LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { + return static_cast<__atomic_small_proxy_t<_Tp>>(__val); +} + +template::value, int> = 0> +constexpr _LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { + return static_cast<_Tp>(__val); +} + +// Non-arithmetic conversion to/from proxy types +template::value, int> = 0> +_LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { + __atomic_small_proxy_t<_Tp> __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template::value, int> = 0> +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { + _Tp __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template +struct __atomic_small_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_small_tag; + using __proxy_t = __atomic_small_proxy_t<_Tp>; + + __atomic_small_storage() noexcept = default; + + _LIBCUDACXX_HOST_DEVICE + constexpr explicit __atomic_small_storage(_Tp __value) : __a_value(__atomic_small_to_32(__value)) {} + + __atomic_storage<__proxy_t> __a_value; +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> 
__val, _Sco, __atomic_small_tag) { + __atomic_init_dispatch(__a.__a_value, __atomic_small_to_32(__val), _Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + __atomic_store_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_load_dispatch(__a.__a_value, __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_exchange_dispatch(__a.__a_value, __atomic_small_to_32(__value), __order, _Sco{})); +} +_LIBCUDACXX_HOST_DEVICE +inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + auto __lhs_c = reinterpret_cast(__lhs); + auto __rhs_c = reinterpret_cast(__rhs); + while (__count--) { + auto const __lhs_v = *__lhs_c++; + auto const __rhs_v = *__rhs_c++; + if (__lhs_v < __rhs_v) { return -1; } + if (__lhs_v > __rhs_v) { return 1; } + } + return 0; + ), + NV_IS_HOST, ( + return memcmp(__lhs, __rhs, __count); + ) + ) +} + +template +_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { + auto __temp_expected = __atomic_small_to_32(*__expected); + auto const __ret = __atomic_compare_exchange_weak_dispatch(__a.__a_value, &__temp_expected, __atomic_small_to_32(__value), __success, __failure, _Sco{}); + auto const __actual = __atomic_small_from_32<__atomic_underlying_t<_Tp>>(__temp_expected); + constexpr auto __mask = static_cast((1u << (8*sizeof(__atomic_underlying_t<_Tp>))) - 1); + if(!__ret) { + if(0 == __cuda_memcmp(&__actual, __expected, sizeof(__atomic_underlying_t<_Tp>))) + __atomic_fetch_and_dispatch(__a.__a_value, __mask, memory_order_relaxed, _Sco{}); + else + *__expected = __actual; + } + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { + auto const __old = *__expected; + while(1) { + if(__atomic_compare_exchange_weak_dispatch(__a, __expected, __value, __success, __failure, _Sco{}, __atomic_small_tag{})) + return true; + if(0 != __cuda_memcmp(&__old, __expected, sizeof(__atomic_underlying_t<_Tp>))) + return false; + } +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_add_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_sub_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, 
__atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_and_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_or_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_xor_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_max_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_min_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h new file mode 100644 index 0000000000..87ac58ca73 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -0,0 +1,188 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H +#define _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H + +#include + +#include +#include + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Leaving this in to figure out if we want this. +// For now this should be dead code, as we don't support platform wait. 
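+// A rough usage sketch of the entry points this header provides, assuming an
+// __atomic_storage<int> that is value-constructible (as __atomic_small_storage is)
+// and the __thread_scope_system_tag scope used by the <cuda/std/atomic> front end:
+//
+//   __atomic_storage<int> __a{0};
+//   // Waiter: blocks until a load no longer observes 0.
+//   __atomic_wait(&__a, 0, memory_order_acquire, __thread_scope_system_tag{});
+//
+//   // Notifier, on another thread, after storing a new value into __a:
+//   __atomic_notify_all(&__a, __thread_scope_system_tag{});
+//
+// On device, waiting degrades to polling via __atomic_try_wait_slow_fallback; the
+// branches immediately below are only reachable on hosts that define the
+// corresponding configuration macros.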
+#ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT + +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) + __libcpp_platform_wake(&__c->__version, true); +#endif +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + __cxx_atomic_notify_all(__a); +} +template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + return; + if(sizeof(__libcpp_platform_wait_t) < 8) { + constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. + __libcpp_platform_wait(&__c->__version, __version, &__timeout); + } + else + __libcpp_platform_wait(&__c->__version, __version, nullptr); +#else + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +} + +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); +#endif + __libcpp_platform_wait((_Tp*)__a, __val, nullptr); +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); +#endif +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +#endif + __libcpp_platform_wake((_Tp*)__a, true); +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +#endif + __libcpp_platform_wake((_Tp*)__a, false); +} + +// Contention table wait/notify is also not supported as above. 
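+// In rough terms, the contention-table branch below keys each atomic address to a
+// shared record (via __libcpp_contention_state) holding a mutex, a condition
+// variable and a __credit counter.  A waiter takes the mutex, publishes
+// __credit = 1, re-checks the value and blocks on the condition variable only if
+// it is still unchanged; a notifier returns early unless it can consume that
+// credit, in which case it locks and unlocks the mutex to synchronize with an
+// in-flight waiter and then broadcasts.  Uncontended notifies therefore stay cheap.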
+#elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) + +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) + return; + if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { + __libcpp_mutex_lock(&__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); + __libcpp_condvar_broadcast(&__c->__condvar); + } +} +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + __cxx_atomic_notify_all(__a); +} +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { + auto * const __c = __libcpp_contention_state(__a); + __libcpp_mutex_lock(&__c->__mutex); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); +} + +#else + +// Heterogeneous atomic impl begins here +extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__(); + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_try_wait_slow(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, + __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{});, + NV_IS_HOST, + __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{});, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_one(_Tp const volatile*, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70,, + NV_IS_HOST,, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_all(_Tp const volatile*, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70,, + NV_IS_HOST,, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +#endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) + +template _LIBCUDACXX_INLINE_VISIBILITY +bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { +#if defined(_CCCL_CUDA_COMPILER) + return __lhs == __rhs; +#else + return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; +#endif +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_wait(_Tp const volatile* __a, __atomic_underlying_t<_Tp> const __val, memory_order __order, _Sco = {}) { + for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { + if(!__nonatomic_compare_equal(__atomic_load_dispatch(*__a, __order, _Sco{}, __atomic_tag_t<_Tp>{}), __val)) + return; + if(__i < 12) + __libcpp_thread_yield_processor(); + else + __libcpp_thread_yield(); + } + while(__nonatomic_compare_equal(__atomic_load_dispatch(*__a, __order, _Sco{}, __atomic_tag_t<_Tp>{}), __val)) + __atomic_try_wait_slow(__a, __val, __order, _Sco{}); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h new file mode 
100644 index 0000000000..4f4a8dd9a3 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_POLLING_H +#define _LIBCUDACXX___ATOMIC_WAIT_POLLING_H + +#include + +#include +#include +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_poll_tester { + using __underlying_t = __atomic_underlying_t<_Tp>; + + _Tp const volatile* __atom; + __underlying_t __val; + memory_order __order; + + _LIBCUDACXX_HOST_DEVICE + __atomic_poll_tester(_Tp const volatile* __a, __underlying_t __v, memory_order __o) + : __atom(__a) + , __val(__v) + , __order(__o) + {} + + _LIBCUDACXX_HOST_DEVICE + bool operator()() const { + return !(__atomic_load_dispatch(*__atom, __order, _Sco{}, __atomic_tag_t<_Tp>{}) == __val); + } +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_try_wait_slow_fallback(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { + __libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_WAIT_POLLING_H diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index 298b69726f..8e9eaf1664 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -589,166 +589,20 @@ void atomic_signal_fence(memory_order m) noexcept; # error C++ standard library is incompatible with #endif -#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ - __m == memory_order_acquire || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ - __f == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#endif +#include +#include -#if !defined(_CCCL_COMPILER_NVRTC) -# include -#endif +#include +#include +#include +#include +#include -#if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -#define ATOMIC_BOOL_LOCK_FREE 2 -#define ATOMIC_CHAR_LOCK_FREE 2 -#define ATOMIC_CHAR16_T_LOCK_FREE 2 -#define ATOMIC_CHAR32_T_LOCK_FREE 2 -#define ATOMIC_WCHAR_T_LOCK_FREE 2 -#define ATOMIC_SHORT_LOCK_FREE 2 -#define ATOMIC_INT_LOCK_FREE 2 -#define ATOMIC_LONG_LOCK_FREE 2 -#define ATOMIC_LLONG_LOCK_FREE 2 -#define ATOMIC_POINTER_LOCK_FREE 2 -#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) - -#ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#define __ATOMIC_CONSUME 1 -#define __ATOMIC_ACQUIRE 2 -#define __ATOMIC_RELEASE 3 -#define 
__ATOMIC_ACQ_REL 4 -#define __ATOMIC_SEQ_CST 5 -#endif //__ATOMIC_RELAXED +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD -// Figure out what the underlying type for `memory_order` would be if it were -// declared as an unscoped enum (accounting for -fshort-enums). Use this result -// to pin the underlying type in C++20. -enum __legacy_memory_order { - __mo_relaxed, - __mo_consume, - __mo_acquire, - __mo_release, - __mo_acq_rel, - __mo_seq_cst -}; - -typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; - -#if _CCCL_STD_VER > 2017 - -enum class memory_order : __memory_order_underlying_t { - relaxed = __mo_relaxed, - consume = __mo_consume, - acquire = __mo_acquire, - release = __mo_release, - acq_rel = __mo_acq_rel, - seq_cst = __mo_seq_cst -}; - -inline constexpr auto memory_order_relaxed = memory_order::relaxed; -inline constexpr auto memory_order_consume = memory_order::consume; -inline constexpr auto memory_order_acquire = memory_order::acquire; -inline constexpr auto memory_order_release = memory_order::release; -inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; -inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; - -#else - -typedef enum memory_order { - memory_order_relaxed = __mo_relaxed, - memory_order_consume = __mo_consume, - memory_order_acquire = __mo_acquire, - memory_order_release = __mo_release, - memory_order_acq_rel = __mo_acq_rel, - memory_order_seq_cst = __mo_seq_cst, -} memory_order; - -#endif // _CCCL_STD_VER > 2017 - -template _LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { -#if defined(_CCCL_CUDA_COMPILER) - return __lhs == __rhs; -#else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; -#endif -} - -static_assert((is_same::type, __memory_order_underlying_t>::value), - "unexpected underlying type for std::memory_order"); - -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || \ - defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) - -// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because -// the default operator= in an object is not volatile, a byte-by-byte copy -// is required. -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { - __a_value = __val; -} -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { - volatile char* __to = reinterpret_cast(&__a_value); - volatile char* __end = __to + sizeof(_Tp); - volatile const char* __from = reinterpret_cast(&__val); - while (__to != __end) - *__to++ = *__from++; -} - -#endif - -// Headers are wrapped like so: (cuda::std::|std::)detail -namespace __detail { -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) -# include -#endif - -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) -// TODO: Maybe support C11 atomics? 
-// #include -#endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP -} - -using __detail::__cxx_atomic_base_impl; -using __detail::__cxx_atomic_ref_base_impl; -using __detail::__cxx_atomic_thread_fence; -using __detail::__cxx_atomic_signal_fence; -using __detail::__cxx_atomic_load; -using __detail::__cxx_atomic_store; -using __detail::__cxx_atomic_exchange; -using __detail::__cxx_atomic_compare_exchange_weak; -using __detail::__cxx_atomic_compare_exchange_strong; -using __detail::__cxx_atomic_fetch_add; -using __detail::__cxx_atomic_fetch_sub; -using __detail::__cxx_atomic_fetch_or; -using __detail::__cxx_atomic_fetch_and; -using __detail::__cxx_atomic_fetch_xor; template _LIBCUDACXX_INLINE_VISIBILITY @@ -757,529 +611,22 @@ _Tp kill_dependency(_Tp __y) noexcept return __y; } -#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE -#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE -#endif - -#ifdef _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS - -template -struct __cxx_atomic_lock_impl { - - _LIBCUDACXX_INLINE_VISIBILITY - __cxx_atomic_lock_impl() noexcept - : __a_value(), __a_lock(0) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit - __cxx_atomic_lock_impl(_Tp value) noexcept - : __a_value(value), __a_lock(0) {} - - _Tp __a_value; - mutable __cxx_atomic_base_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, _Sco> __a_lock; - - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile { - __lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a_value); - __unlock(); - return __old; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const { - __lock(); - _Tp __old = __a_value; - __unlock(); - 
return __old; - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { - __a->__a_value = __val; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { - __a->__lock(); - __cxx_atomic_assign_volatile(__a->__a_value, __val); - __a->__unlock(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { - __a->__lock(); - __a->__a_value = __val; - __a->__unlock(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { - return __a->__read(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { - return __a->__read(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __value); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value = __value; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if(__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if(__ret) - __a->__a_value = __value; - else - *__expected = __a->__a_value; - __a->__unlock(); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if(__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if(__ret) - __a->__a_value = __value; - else - *__expected = __a->__a_value; - __a->__unlock(); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, 
memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value -= __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value &= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value |= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value ^= __pattern; - __a->__unlock(); - return __old; -} - -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template 
struct __cxx_is_always_lock_free { - enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; - -#else - -template struct __cxx_is_always_lock_free { - enum { __value = sizeof(_Tp) <= 8 }; }; - -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template -struct __cxx_atomic_impl_conditional { - using type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, - __cxx_atomic_base_impl<_Tp, _Sco>, - __cxx_atomic_lock_impl<_Tp, _Sco> >; -}; - -template ::type > -#else -template > -#endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -struct __cxx_atomic_impl : public _Base { - __cxx_atomic_impl() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_impl(_Tp value) noexcept - : _Base(value) {} -}; - - -template -_LIBCUDACXX_INLINE_VISIBILITY -__cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) { - static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp),""); - static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp),""); - return (__cxx_atomic_impl<_Tp, _Sco>*)__inst; -} - -template -using __cxx_atomic_ref_impl = __cxx_atomic_ref_base_impl<_Tp, _Sco>; - -#ifdef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +template +struct __atomic_impl_traits { + static constexpr bool __atomic_requires_lock = __atomic_is_always_lock_free<_Tp>::__value; + static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; + static constexpr bool __atomic_supports_reference = sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8; -template , int _Sco = _Ty::__sco> -struct __cxx_atomic_poll_tester { - _Ty const volatile* __a; - _Tp __val; - memory_order __order; + using __atomic_storage_t = typename __conditional_t<__atomic_requires_small, + __atomic_small_storage<_Tp>, + __conditional_t<__atomic_requires_lock, + __atomic_locked_storage<_Tp>, + __atomic_storage<_Tp> + >>; - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) - : __a(__a_) - , __val(__val_) - , __order(__order_) - {} - - _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return !(__cxx_atomic_load(__a, __order) == __val); - } + using __atomic_ref_storage_t = typename __atomic_ref_storage<_Tp>; }; -template , int _Sco = _Ty::__sco> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) { - __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); -} - -#endif - -#ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) - __libcpp_platform_wake(&__c->__version, true); -#endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * 
const __c = __libcpp_contention_state(__a); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(sizeof(__libcpp_platform_wait_t) < 8) { - constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. - __libcpp_platform_wait(&__c->__version, __version, &__timeout); - } - else - __libcpp_platform_wait(&__c->__version, __version, nullptr); -#else - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -} - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); -#endif - __libcpp_platform_wait((_Tp*)__a, __val, nullptr); -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); -#endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, true); -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, false); -} - -#elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) - return; - if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { - __libcpp_mutex_lock(&__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); - __libcpp_condvar_broadcast(&__c->__condvar); - } -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { - auto * const __c = __libcpp_contention_state(__a); - __libcpp_mutex_lock(&__c->__mutex); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); - 
__cxx_atomic_thread_fence(memory_order_seq_cst); - if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); -} - -#else - -template -struct __atomic_wait_and_notify_supported -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 - : false_type -#else - : true_type -#endif -{}; - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-one operations are unsupported on Pascal"); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-all operations are unsupported on Pascal"); -} - -#endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) { - for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { - if(!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(__i < 12) - __libcpp_thread_yield_processor(); - else - __libcpp_thread_yield(); - } - while(__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __cxx_atomic_try_wait_slow(__a, __val, __order); -} - template struct __atomic_base_storage { mutable _Storage __a_; @@ -1321,103 +668,103 @@ struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{ void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, 
_Tp __d, memory_order __s, memory_order __f) volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept { if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return 
__atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} }; template @@ -1446,103 +793,103 @@ struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_T void store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} 
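+  // For the compare_exchange overloads below that take a single memory_order, the
+  // failure ordering is derived from the success ordering the same way the C++
+  // standard specifies for std::atomic: acq_rel maps to acquire and release maps
+  // to relaxed, since a failed compare-exchange performs no store.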
_LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept { if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, 
memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} }; template @@ -1559,16 +906,16 @@ struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) volatile noexcept {return fetch_add(_Tp(1));} @@ -1610,16 +957,16 @@ struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order 
__m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const volatile noexcept {return fetch_add(_Tp(1));} @@ -1661,22 +1008,22 @@ struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storag _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) volatile noexcept {return fetch_and(__op) & __op;} @@ -1706,22 +1053,22 @@ struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithme _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + 
{return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const volatile noexcept {return fetch_and(__op) & __op;} @@ -1744,7 +1091,7 @@ using __atomic_select_base = __conditional_t::value, __atomic_base_bitwise<_Tp, _Cq, _Storage>, __atomic_base_core<_Tp, _Cq, _Storage> >>; -template >> +template ::__atomic_storage_t>> struct __atomic_base : public _Base { __atomic_base() = default; __atomic_base(const __atomic_base&) = delete; @@ -1755,10 +1102,10 @@ struct __atomic_base : public _Base { _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base(const _Tp& __a) noexcept : - _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) {} + _Base(__atomic_impl_traits<_Tp>::__atomic_storage_t(__a)) {} }; -template >> +template ::__atomic_ref_storage_t>> struct __atomic_base_ref : public _Base { __atomic_base_ref() = default; __atomic_base_ref(const __atomic_base_ref&) = default; @@ -1769,7 +1116,7 @@ struct __atomic_base_ref : public _Base { _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_ref(_Tp& __a) noexcept : - _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) {} + _Base(__atomic_impl_traits<_Tp>::__atomic_ref_storage_t(__a)) {} }; #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) @@ -1820,19 +1167,19 @@ struct atomic<_Tp*> _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) volatile noexcept {return fetch_add(1);} @@ -1902,11 +1249,11 @@ template _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) const noexcept {return fetch_add(1);} @@ -1947,7 +1294,7 @@ _LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __atomic_init_dispatch(__o->__a_, __d); } template @@ -1955,7 +1302,7 @@ _LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __atomic_init_dispatch(__o->__a_, __d); } // 
atomic_store @@ -2553,47 +1900,47 @@ atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept typedef struct atomic_flag { - __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; + __atomic_impl_traits<_LIBCUDACXX_ATOMIC_FLAG_TYPE>::__atomic_storage_t __a_; _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} + {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__atomic_load_dispatch(__a_, __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} + {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__atomic_load_dispatch(__a_, __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} + {return __atomic_exchange_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} + {return __atomic_exchange_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + {__atomic_store_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + {__atomic_store_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} + {__atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} + {__atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&__a_);} + {__atomic_notify_one(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&__a_);} + {__atomic_notify_one(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&__a_);} + {__atomic_notify_all(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&__a_);} + {__atomic_notify_all(&__a_, __thread_scope_system_tag{});} #endif atomic_flag() noexcept = default; @@ -2759,14 +2106,14 @@ inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept { - 
__cxx_atomic_thread_fence(__m); + __atomic_thread_fence_dispatch(__m); } inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept { - __cxx_atomic_signal_fence(__m); + __atomic_signal_fence_dispatch(__m); } // Atomics for standard typedef types From a780c269f23e955fe572c725e0e1f3ea786b158b Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 13:04:45 -0700 Subject: [PATCH 06/71] Change atomic_storage operator()() to get() --- .../std/__atomic/operations/heterogeneous.h | 58 +++++++++---------- .../cuda/std/__atomic/operations/host.h | 4 +- .../include/cuda/std/__atomic/storage/base.h | 8 +-- .../cuda/std/__atomic/storage/reference.h | 2 +- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 86a142de08..5a87e876a6 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -66,7 +66,7 @@ using __atomic_enable_if_default_base_t = __enable_if_t> _LIBCUDACXX_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { - __atomic_assign_volatile(__a(), __val); + __atomic_assign_volatile(__a.get(), __val); } template > @@ -75,10 +75,10 @@ _LIBCUDACXX_HOST_DEVICE alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __atomic_store_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + __atomic_store_n_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - __atomic_store_host(__a(), __tmp, __order); + __atomic_store_host(__a.get(), __val, __order); ) ) } @@ -88,10 +88,10 @@ _LIBCUDACXX_HOST_DEVICE auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_load_n_cuda(__a(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_load_n_cuda(__a.get(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_load_host(__a(), __order); + return __atomic_load_host(__a.get(), __order); ) ) } @@ -102,10 +102,10 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underly alignas(_Tp) auto __tmp = __value; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_exchange_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_exchange_n_cuda(__a.get(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_exchange_host(__a(), __tmp, __order); + return __atomic_exchange_host(__a.get(), __tmp, __order); ) ) } @@ -116,10 +116,10 @@ _LIBCUDACXX_HOST_DEVICE bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + __result = __atomic_compare_exchange_cuda(__a.get(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); ), NV_IS_HOST, ( - __result = __atomic_compare_exchange_strong_host(__a(), __expected, __val, __success, __failure); + __result = __atomic_compare_exchange_strong_host(__a.get(), __expected, __val, __success, __failure); ) ) return 
__result; @@ -131,10 +131,10 @@ _LIBCUDACXX_HOST_DEVICE bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + __result = __atomic_compare_exchange_cuda(__a.get(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); ), NV_IS_HOST, ( - __result = __atomic_compare_exchange_weak_host(__a(), __expected, __val, __success, __failure); + __result = __atomic_compare_exchange_weak_host(__a.get(), __expected, __val, __success, __failure); ) ) return __result; @@ -150,10 +150,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_add_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_add_host(__a(), __delta, __order); + return __atomic_fetch_add_host(__a.get(), __delta, __order); ) ) } @@ -163,10 +163,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_add_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_add_host(__a(), __delta, __order); + return __atomic_fetch_add_host(__a.get(), __delta, __order); ) ) } @@ -176,10 +176,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_sub_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_sub_cuda(__a(), __delta, __order); + return __atomic_fetch_sub_cuda(__a.get(), __delta, __order); ) ) } @@ -189,10 +189,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_sub_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_sub_host(__a(), __delta, __order); + return __atomic_fetch_sub_host(__a.get(), __delta, __order); ) ) } @@ -202,10 +202,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_and_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_and_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), 
_Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_and_host(__a(), __pattern, __order); + return __atomic_fetch_and_host(__a.get(), __pattern, __order); ) ) } @@ -215,10 +215,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_or_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_or_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_or_host(__a(), __pattern, __order); + return __atomic_fetch_or_host(__a.get(), __pattern, __order); ) ) } @@ -228,10 +228,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_xor_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_xor_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_xor_host(__a(), __pattern, __order); + return __atomic_fetch_xor_host(__a.get(), __pattern, __order); ) ) } @@ -241,9 +241,9 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_max_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_max_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), ( - return __atomic_fetch_max_host(__a(), __val, __order); + return __atomic_fetch_max_host(__a.get(), __val, __order); ) ) } @@ -253,9 +253,9 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_min_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_min_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), ( - return __atomic_fetch_min_host(__a(), __val, __order); + return __atomic_fetch_min_host(__a.get(), __val, __order); ) ) } diff --git a/libcudacxx/include/cuda/std/__atomic/operations/host.h b/libcudacxx/include/cuda/std/__atomic/operations/host.h index 4870c011c4..e6015f33e4 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/host.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/host.h @@ -38,14 +38,14 @@ inline void __atomic_store_host(_Tp* __a, _Up __val, memory_order __order) { } template -inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> _Tp { +inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> __remove_cvref_t<_Tp> { __remove_cvref_t<_Tp> __ret{}; __atomic_load(__a, &__ret, __atomic_order_to_int(__order)); return __ret; } template -inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> _Tp { +inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> __remove_cvref_t<_Tp> { __remove_cvref_t<_Tp> __ret{}; __atomic_exchange(__a, &__val, &__ret, __atomic_order_to_int(__order)); return __ret; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h 
b/libcudacxx/include/cuda/std/__atomic/storage/base.h index ef197fd4ef..ca6a5fceaf 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -41,16 +41,16 @@ struct __atomic_storage { __atomic_storage(_Tp value) noexcept : __a_value(value) {} - _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() volatile -> volatile __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() const -> const __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() const -> const __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() const volatile -> const volatile __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { return &__a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h index 3ead98703a..a892f24d12 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/reference.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -38,7 +38,7 @@ struct __atomic_ref_storage { __atomic_ref_storage(_Tp& value) noexcept : __a_value(&value) {} - _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { return __a_value; } }; From 217527d75683c8765845d99d84de02d4d34ab752 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 14:43:30 -0700 Subject: [PATCH 07/71] Fixup: Change desired of compexch to accept by value. * This matches other implementations. 
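* Illustrative call-site sketch (not part of this patch): with __desired taken
  by value, callers forward the value directly instead of materializing an
  addressable temporary first. The wrapper name below is hypothetical; only the
  __atomic_compare_exchange_cuda signature and tag type come from the generated
  header, and the sketch assumes a 4- or 8-byte _Type so one of the generated
  overloads applies.

    // Hypothetical caller, shown only to illustrate the signature change.
    template <class _Type>
    _CCCL_DEVICE bool __example_cas_block(_Type* __ptr, _Type* __expected, _Type __desired)
    {
      // Old signature: const _Type* __desired -- call sites passed &__desired.
      // New signature: const _Type __desired  -- call sites pass the value itself;
      // the generated body still memcpy's it into a 32/64-bit proxy integer.
      return __atomic_compare_exchange_cuda(__ptr, __expected, __desired,
                                            /*weak=*/false,
                                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
                                            __thread_scope_block_tag{});
    }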
--- libcudacxx/codegen/codegen.cpp | 2 +- .../operations/atomic_cuda_ptx_generated.h | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index c1f809bd4b..2df154de05 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -305,7 +305,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD { out << "template = 0>\n"; out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv - << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int " + << "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int " "__failure_memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n"; diff --git a/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h index ff1bdcf1ff..52330eab5f 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h @@ -252,7 +252,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -286,7 +286,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -1159,7 +1159,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, 
bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -1193,7 +1193,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -2429,7 +2429,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -2463,7 +2463,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -3336,7 +3336,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type 
*__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -3370,7 +3370,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -4606,7 +4606,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -4640,7 +4640,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -5513,7 +5513,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; 
memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -5547,7 +5547,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); From eaaa670a85108199d512d7416c8663515d3c10c1 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 14:47:58 -0700 Subject: [PATCH 08/71] Fix merge conflicts (LIBCUDACXX->CCCL) --- .../std/__atomic/operations/heterogeneous.h | 34 ++++++++--------- libcudacxx/include/cuda/std/__atomic/order.h | 6 +-- .../include/cuda/std/__atomic/storage/base.h | 12 +++--- .../cuda/std/__atomic/storage/common.h | 4 +- .../cuda/std/__atomic/storage/locked.h | 36 +++++++++--------- .../cuda/std/__atomic/storage/reference.h | 4 +- .../include/cuda/std/__atomic/storage/small.h | 38 +++++++++---------- .../include/cuda/std/__atomic/wait/polling.h | 6 +-- 8 files changed, 70 insertions(+), 70 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 5a87e876a6..5bfa86661d 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline void __atomic_thread_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( @@ -42,7 +42,7 @@ inline ) } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline void __atomic_signal_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( @@ -64,13 +64,13 @@ template using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { __atomic_assign_volatile(__a.get(), __val); } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( @@ -84,7 +84,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -97,7 +97,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco = {}, _Tag = {}) { alignas(_Tp) auto __tmp = __value; NV_DISPATCH_TARGET( @@ -111,7 +111,7 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underly } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { bool __result = false; NV_DISPATCH_TARGET( @@ 
-126,7 +126,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { bool __result = false; NV_DISPATCH_TARGET( @@ -146,7 +146,7 @@ template using __atomic_enable_if_not_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -159,7 +159,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -172,7 +172,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -185,7 +185,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -198,7 +198,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -211,7 +211,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -224,7 +224,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -237,7 +237,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( @@ -249,7 +249,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index d5c37c45ec..0310f125b6 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -83,7 +83,7 @@ typedef enum memory_order { #endif // _CCCL_STD_VER > 2017 -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline int __stronger_order_cuda(int __a, int __b) { int const __max = __a > __b ? __a : __b; if(__max != __ATOMIC_RELEASE) @@ -96,7 +96,7 @@ inline int __stronger_order_cuda(int __a, int __b) { return __xform[__a < __b ? 
__a : __b]; } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline constexpr int __atomic_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: @@ -107,7 +107,7 @@ inline constexpr int __atomic_order_to_int(memory_order __order) { __ATOMIC_CONSUME)))); } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline constexpr int __atomic_failure_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h index ca6a5fceaf..75500d0769 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -34,23 +34,23 @@ struct __atomic_storage { _ALIGNAS(sizeof(_Tp)) _Tp __a_value; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_storage() noexcept : __a_value() {} - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_storage(_Tp value) noexcept : __a_value(value) {} - _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() -> __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() const -> const __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() const -> const __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { return &__a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/common.h b/libcudacxx/include/cuda/std/__atomic/storage/common.h index 22f946aada..48a3307616 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/common.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/common.h @@ -21,13 +21,13 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // is required. 
template __enable_if_t::value> -_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { +_CCCL_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { __a_value = __val; } template __enable_if_t::value> -_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { +_CCCL_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { volatile char* __to = reinterpret_cast(&__a_value); volatile char* __end = __to + sizeof(_Tp); volatile const char* __from = reinterpret_cast(&__val); diff --git a/libcudacxx/include/cuda/std/__atomic/storage/locked.h b/libcudacxx/include/cuda/std/__atomic/storage/locked.h index ab359bc780..2c579cf23a 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/locked.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/locked.h @@ -30,10 +30,10 @@ struct __atomic_locked_storage { using __underlying_t = typename remove_cv<_Tp>::type; using __tag_t = typename __atomic_locked_tag; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_locked_storage() noexcept : __a_value(), __a_lock(0) {} - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_locked_storage(_Tp value) noexcept : __a_value(value), __a_lock(0) {} @@ -41,33 +41,33 @@ struct __atomic_locked_storage { mutable __atomic_storage<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a_lock; template - _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const volatile { + _CCCL_HOST_DEVICE void __lock(_Sco) const volatile { while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) /*spin*/; } template - _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const { + _CCCL_HOST_DEVICE void __lock(_Sco) const { while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) /*spin*/; } template - _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const volatile { + _CCCL_HOST_DEVICE void __unlock(_Sco) const volatile { __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); } template - _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const { + _CCCL_HOST_DEVICE void __unlock(_Sco) const { __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); } }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_locked_tag) { __atomic_assign_volatile(__a.__a_value, __val); } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order, _Sco, __atomic_locked_tag) { __a.__lock(_Sco{}); __atomic_assign_volatile(__a.__a_value, __val); @@ -75,7 +75,7 @@ void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; __a.__lock(_Sco{}); @@ -85,7 +85,7 @@ __atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; __a.__lock(_Sco{}); @@ -96,7 +96,7 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, 
__atomic_underly } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __temp; @@ -112,7 +112,7 @@ bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __temp; @@ -128,7 +128,7 @@ bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, _Td __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -140,7 +140,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -152,7 +152,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -164,7 +164,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -176,7 +176,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -188,7 +188,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h index a892f24d12..cfd2e3a5e2 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/reference.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -34,11 +34,11 @@ struct __atomic_ref_storage { _Tp* __a_value; - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_ref_storage(_Tp& value) noexcept : __a_value(&value) {} - _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() -> __underlying_t* { return __a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/small.h b/libcudacxx/include/cuda/std/__atomic/storage/small.h index 679fbd5487..1f4c88abd1 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/small.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/small.h @@ -33,25 +33,25 @@ using __atomic_small_proxy_t = __conditional_t::value, int32_t, u // Arithmetic conversions 
to/from proxy types template::value, int> = 0> -constexpr _LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { +constexpr _CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { return static_cast<__atomic_small_proxy_t<_Tp>>(__val); } template::value, int> = 0> -constexpr _LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { +constexpr _CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { return static_cast<_Tp>(__val); } // Non-arithmetic conversion to/from proxy types template::value, int> = 0> -_LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { +_CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { __atomic_small_proxy_t<_Tp> __temp{}; memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } template::value, int> = 0> -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { +_CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { _Tp __temp{}; memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; @@ -65,33 +65,33 @@ struct __atomic_small_storage { __atomic_small_storage() noexcept = default; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE constexpr explicit __atomic_small_storage(_Tp __value) : __a_value(__atomic_small_to_32(__value)) {} __atomic_storage<__proxy_t> __a_value; }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_small_tag) { __atomic_init_dispatch(__a.__a_value, __atomic_small_to_32(__val), _Sco{}); } template -_LIBCUDACXX_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { __atomic_store_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{}); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_load_dispatch(__a.__a_value, __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_exchange_dispatch(__a.__a_value, __atomic_small_to_32(__value), __order, _Sco{})); } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -112,7 +112,7 @@ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) } template -_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, 
__atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { auto __temp_expected = __atomic_small_to_32(*__expected); auto const __ret = __atomic_compare_exchange_weak_dispatch(__a.__a_value, &__temp_expected, __atomic_small_to_32(__value), __success, __failure, _Sco{}); auto const __actual = __atomic_small_from_32<__atomic_underlying_t<_Tp>>(__temp_expected); @@ -127,7 +127,7 @@ _LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& } template -_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { auto const __old = *__expected; while(1) { if(__atomic_compare_exchange_weak_dispatch(__a, __expected, __value, __success, __failure, _Sco{}, __atomic_small_tag{})) @@ -138,37 +138,37 @@ _LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_T } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_add_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_sub_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_and_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_or_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_xor_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } 
template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_max_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_min_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); } diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h index 4f4a8dd9a3..0a5e06c28f 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/polling.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -32,21 +32,21 @@ struct __atomic_poll_tester { __underlying_t __val; memory_order __order; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_poll_tester(_Tp const volatile* __a, __underlying_t __v, memory_order __o) : __atom(__a) , __val(__v) , __order(__o) {} - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE bool operator()() const { return !(__atomic_load_dispatch(*__atom, __order, _Sco{}, __atomic_tag_t<_Tp>{}) == __val); } }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_try_wait_slow_fallback(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { __libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); } From 452fc3b0c55cd43dfdaa543fce764ba8163c680d Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 16:09:27 -0700 Subject: [PATCH 09/71] Fix another merge conflict (LIBCUDACXX->CCCL) --- libcudacxx/include/cuda/std/__atomic/storage/base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h index 75500d0769..1725c4e819 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -32,7 +32,7 @@ struct __atomic_storage { "std::atomic requires that 'Tp' be a trivially copyable type"); #endif - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; _CCCL_HOST_DEVICE __atomic_storage() noexcept From 91f8b11623205e0515763f099dc69eb1458acb1a Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 16:11:08 -0700 Subject: [PATCH 10/71] Simplify tag dispatch in the atomic backend --- .../std/__atomic/operations/heterogeneous.h | 73 +++++++-------- .../include/cuda/std/__atomic/storage/base.h | 3 +- .../cuda/std/__atomic/storage/common.h | 3 - .../cuda/std/__atomic/storage/locked.h | 91 +++++++++---------- .../cuda/std/__atomic/storage/reference.h | 3 +- .../include/cuda/std/__atomic/storage/small.h | 63 +++++++------ 6 files changed, 111 insertions(+), 125 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 5bfa86661d..9ef3fcf51e 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ 
b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -55,23 +55,16 @@ inline ) } -// Regarding __atomic_base_Tag -// It *is* possible to define it as: -// _Tag = __atomic_enable_if_default_base_t<_Tp> and make all tag types default to the 'base' backend -// I don't know if it's necessary to do that though. For now, this just adds some kind of protection -// preventing access to the functions with the wrong tag type. -template -using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; - -template > +// automatically dispatch based on default argument of '_Sto<_Tp, tag_t>' +template