diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index b0df1b5a98..3477f988af 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -19,8 +19,8 @@ target_compile_features( add_dependencies(libcudacxx.atomics.codegen codegen) -set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_generated.h") -set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/detail/libcxx/include/support/atomic") +set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions") add_custom_target( libcudacxx.atomics.codegen.execute @@ -32,13 +32,13 @@ add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute) add_custom_target( libcudacxx.atomics.codegen.install - COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_generated.h" - BYPRODUCTS "${atomic_install_location}/atomic_cuda_generated.h" + COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/cuda_ptx_generated.h" + BYPRODUCTS "${atomic_install_location}/cuda_ptx_generated.h" ) add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute) add_test( NAME libcudacxx.atomics.codegen.diff - COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_generated.h" "${atomic_generated_output}" + COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/cuda_ptx_generated.h" "${atomic_generated_output}" ) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index 77d96a92d9..b7111c44d7 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -66,7 +66,7 @@ int main() std::vector cv_qualifier{"volatile ", ""}; - std::ofstream out("atomic_cuda_generated.h"); + std::ofstream out("cuda_ptx_generated.h"); out << R"XXX(//===----------------------------------------------------------------------===// // @@ -78,8 +78,36 @@ int main() // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate +// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_CCCL_CUDA_COMPILER) + )XXX"; auto scopenametag = [&](auto scope) { @@ -302,11 +330,11 @@ int main() { out << "template = 0>\n"; out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv - << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int " + << "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int " "__failure_memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n"; - out << " memcpy(&__tmp, __desired, " << sz / 8 << ");\n"; + out << " 
memcpy(&__tmp, &__desired, " << sz / 8 << ");\n"; out << " memcpy(&__old, __expected, " << sz / 8 << ");\n"; out << " __old_tmp = __old;\n"; out << " NV_DISPATCH_TARGET(\n"; @@ -503,6 +531,9 @@ int main() } } + out << "\n#endif // defined(_CCCL_CUDA_COMPILER)\n"; + out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n"; + out << "\n#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H\n"; out << "\n// clang-format on\n"; return 0; diff --git a/libcudacxx/examples/rtc_example.cpp b/libcudacxx/examples/rtc_example.cpp index 08ce22adf2..513e580584 100644 --- a/libcudacxx/examples/rtc_example.cpp +++ b/libcudacxx/examples/rtc_example.cpp @@ -50,11 +50,11 @@ template static constexpr T min(T a, T b) { return a < b ? a : b; } struct trie { struct ref { - cuda::std::atomic ptr = ATOMIC_VAR_INIT(nullptr); + cuda::std::atomic ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr); // the flag will protect against multiple pointer updates - cuda::std::atomic_flag flag = ATOMIC_FLAG_INIT; + cuda::std::atomic_flag flag = LIBCUDACXX_ATOMIC_FLAG_INIT; } next[26]; - cuda::std::atomic count = ATOMIC_VAR_INIT(0); + cuda::std::atomic count = LIBCUDACXX_ATOMIC_VAR_INIT(0); }; __host__ __device__ int index_of(char c) { diff --git a/libcudacxx/examples/trie.cu b/libcudacxx/examples/trie.cu index b4b7a7a5f1..3a16fdceeb 100644 --- a/libcudacxx/examples/trie.cu +++ b/libcudacxx/examples/trie.cu @@ -36,11 +36,11 @@ struct trie { struct ref { - cuda::atomic ptr = ATOMIC_VAR_INIT(nullptr); + cuda::atomic ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr); // the flag will protect against multiple pointer updates - cuda::std::atomic_flag flag = ATOMIC_FLAG_INIT; + cuda::std::atomic_flag flag = LIBCUDACXX_ATOMIC_FLAG_INIT; } next[26]; - cuda::std::atomic count = ATOMIC_VAR_INIT(0); + cuda::std::atomic count = LIBCUDACXX_ATOMIC_VAR_INIT(0); }; __host__ __device__ int index_of(char c) { diff --git a/libcudacxx/examples/trie_mt.cpp b/libcudacxx/examples/trie_mt.cpp index 22fdb68499..2e2a46df29 100644 --- a/libcudacxx/examples/trie_mt.cpp +++ b/libcudacxx/examples/trie_mt.cpp @@ -36,11 +36,11 @@ struct trie { struct ref { - std::atomic ptr = ATOMIC_VAR_INIT(nullptr); + std::atomic ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr); // the flag will protect against multiple pointer updates - std::atomic_flag flag = ATOMIC_VAR_INIT(0); + std::atomic_flag flag = LIBCUDACXX_ATOMIC_VAR_INIT(0); } next[26]; - std::atomic count = ATOMIC_VAR_INIT(0); + std::atomic count = LIBCUDACXX_ATOMIC_VAR_INIT(0); }; int index_of(char c) { diff --git a/libcudacxx/include/cuda/atomic b/libcudacxx/include/cuda/atomic index 3c9e76cb1d..06dd1c785c 100644 --- a/libcudacxx/include/cuda/atomic +++ b/libcudacxx/include/cuda/atomic @@ -11,6 +11,14 @@ #ifndef _CUDA_ATOMIC #define _CUDA_ATOMIC -#include +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header #endif // _CUDA_ATOMIC diff --git a/libcudacxx/include/cuda/std/__atomic/api/common.h b/libcudacxx/include/cuda/std/__atomic/api/common.h new file mode 100644 index 0000000000..e3f8c7c3e6 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/api/common.h @@ -0,0 +1,192 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. 
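// [Editorial aside, not part of the diff] The example changes above swap the
// deprecated C ATOMIC_VAR_INIT/ATOMIC_FLAG_INIT macros for library-provided
// spellings. A minimal usage sketch mirroring the trie examples, assuming the
// renamed macros keep the old plain-initialization semantics (the __node type
// below is hypothetical):
#include <cuda/std/atomic>

struct __node
{
  cuda::std::atomic<__node*> next  = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr);
  cuda::std::atomic_flag     flag  = LIBCUDACXX_ATOMIC_FLAG_INIT;   // guards pointer updates
  cuda::std::atomic<int>     count = LIBCUDACXX_ATOMIC_VAR_INIT(0);
};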
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_API_COMMON_H +#define __LIBCUDACXX___ATOMIC_API_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +// API definitions for the base atomic implementation +#define _LIBCUDACXX_ATOMIC_COMMON_IMPL(_CONST, _VOLATILE) \ + _CCCL_HOST_DEVICE inline bool is_lock_free() const _VOLATILE noexcept \ + { \ + return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); \ + } \ + _CCCL_HOST_DEVICE inline void store(_Tp __d, memory_order __m = memory_order_seq_cst) \ + _CONST _VOLATILE noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + { \ + __atomic_store_dispatch(&__a, __d, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp load(memory_order __m = memory_order_seq_cst) \ + const _VOLATILE noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + { \ + return __atomic_load_dispatch(&__a, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline operator _Tp() const _VOLATILE noexcept \ + { \ + return load(); \ + } \ + _CCCL_HOST_DEVICE inline _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_exchange_dispatch(&__a, __d, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) \ + _CONST _VOLATILE noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) \ + { \ + return __atomic_compare_exchange_weak_dispatch(&__a, &__e, __d, __s, __f, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) \ + _CONST _VOLATILE noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) \ + { \ + return __atomic_compare_exchange_strong_dispatch(&__a, &__e, __d, __s, __f, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) \ + _CONST _VOLATILE noexcept \ + { \ + if (memory_order_acq_rel == __m) \ + return __atomic_compare_exchange_weak_dispatch(&__a, &__e, __d, __m, memory_order_acquire, _Sco{}); \ + else if (memory_order_release == __m) \ + return __atomic_compare_exchange_weak_dispatch(&__a, &__e, __d, __m, memory_order_relaxed, _Sco{}); \ + else \ + return __atomic_compare_exchange_weak_dispatch(&__a, &__e, __d, __m, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) \ + _CONST _VOLATILE noexcept \ + { \ + if (memory_order_acq_rel == __m) \ + return __atomic_compare_exchange_strong_dispatch(&__a, &__e, __d, __m, memory_order_acquire, _Sco{}); \ + else if (memory_order_release == __m) \ + return __atomic_compare_exchange_strong_dispatch(&__a, &__e, __d, __m, memory_order_relaxed, _Sco{}); \ + else \ + return __atomic_compare_exchange_strong_dispatch(&__a, &__e, __d, __m, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const _VOLATILE noexcept \ + { \ + __atomic_wait(&__a, __v, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline void notify_one() _CONST 
_VOLATILE noexcept \ + { \ + __atomic_notify_one(&__a, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline void notify_all() _CONST _VOLATILE noexcept \ + { \ + __atomic_notify_all(&__a, _Sco{}); \ + } + +// API definitions for arithmetic atomics +#define _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(_CONST, _VOLATILE) \ + _CCCL_HOST_DEVICE inline _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_add_dispatch(&__a, __op, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_sub_dispatch(&__a, __op, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator++(int) _CONST _VOLATILE noexcept \ + { \ + return fetch_add(_Tp(1)); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator--(int) _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(_Tp(1)); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator++() _CONST _VOLATILE noexcept \ + { \ + return fetch_add(_Tp(1)) + _Tp(1); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator--() _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(_Tp(1)) - _Tp(1); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator+=(_Tp __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_add(__op) + __op; \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator-=(_Tp __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(__op) - __op; \ + } + +// API definitions for bitwise atomics +#define _LIBCUDACXX_ATOMIC_BITWISE_IMPL(_CONST, _VOLATILE) \ + _CCCL_HOST_DEVICE inline _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_and_dispatch(&__a, __op, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_or_dispatch(&__a, __op, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_xor_dispatch(&__a, __op, __m, _Sco{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator&=(_Tp __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_and(__op) & __op; \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator|=(_Tp __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_or(__op) | __op; \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator^=(_Tp __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_xor(__op) ^ __op; \ + } + +// API definitions for atomics with pointers +#define _LIBCUDACXX_ATOMIC_POINTER_IMPL(_CONST, _VOLATILE) \ + _CCCL_HOST_DEVICE inline _Tp fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) \ + _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_add_dispatch(&__a, __op, __m, __thread_scope_system_tag{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) \ + _CONST _VOLATILE noexcept \ + { \ + return __atomic_fetch_sub_dispatch(&__a, __op, __m, __thread_scope_system_tag{}); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator++(int) _CONST _VOLATILE noexcept \ + { \ + return fetch_add(1); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator--(int) _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(1); \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator++() _CONST _VOLATILE noexcept \ + { \ + return fetch_add(1) + 1; \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator--() _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(1) - 1; \ + } \ + _CCCL_HOST_DEVICE inline _Tp 
operator+=(ptrdiff_t __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_add(__op) + __op; \ + } \ + _CCCL_HOST_DEVICE inline _Tp operator-=(ptrdiff_t __op) _CONST _VOLATILE noexcept \ + { \ + return fetch_sub(__op) - __op; \ + } + +#endif // __LIBCUDACXX___ATOMIC_API_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/api/owned.h b/libcudacxx/include/cuda/std/__atomic/api/owned.h new file mode 100644 index 0000000000..fdbc8baac2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/api/owned.h @@ -0,0 +1,134 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_API_OWNED_H +#define __LIBCUDACXX___ATOMIC_API_OWNED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_common +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_common(_Tp __v) + : __a(__v) + {} + + constexpr inline __atomic_common() = default; + + __atomic_storage_t<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, ) + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, volatile) +}; + +template +struct __atomic_arithmetic +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_arithmetic(_Tp __v) + : __a(__v) + {} + + constexpr inline __atomic_arithmetic() = default; + + __atomic_storage_t<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, ) + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, volatile) + + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(, ) + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(, volatile) +}; + +template +struct __atomic_bitwise +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_bitwise(_Tp __v) + : __a(__v) + {} + + constexpr inline __atomic_bitwise() = default; + + __atomic_storage_t<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, ) + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, volatile) + + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(, ) + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(, volatile) + + _LIBCUDACXX_ATOMIC_BITWISE_IMPL(, ) + _LIBCUDACXX_ATOMIC_BITWISE_IMPL(, volatile) +}; + +template +struct __atomic_pointer +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_pointer(_Tp __v) + : __a(__v) + {} + + constexpr inline __atomic_pointer() = default; + + __atomic_storage_t<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + 
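// [Editorial note, not part of the header] The single-order compare_exchange
// overloads in the common API macro above derive the failure order from the
// success order because the failure order may not be release or acq_rel.
// Restated as a standalone helper (hypothetical name, for illustration only):
_CCCL_HOST_DEVICE constexpr memory_order __failure_order_for(memory_order __m)
{
  return __m == memory_order_acq_rel ? memory_order_acquire
       : __m == memory_order_release ? memory_order_relaxed
                                     : __m;
}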
static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, ) + _LIBCUDACXX_ATOMIC_COMMON_IMPL(, volatile) + + _LIBCUDACXX_ATOMIC_POINTER_IMPL(, ) + _LIBCUDACXX_ATOMIC_POINTER_IMPL(, volatile) +}; + +template +using __atomic_impl = + _If::value, + __atomic_pointer<_Tp, __scope_to_tag<_Sco>>, + _If::value, + __atomic_arithmetic<_Tp, __scope_to_tag<_Sco>>, + _If::value, + __atomic_bitwise<_Tp, __scope_to_tag<_Sco>>, + __atomic_common<_Tp, __scope_to_tag<_Sco>>>>>; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_API_OWNED_H diff --git a/libcudacxx/include/cuda/std/__atomic/api/reference.h b/libcudacxx/include/cuda/std/__atomic/api/reference.h new file mode 100644 index 0000000000..eeba3a6746 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/api/reference.h @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_API_REFERENCE_H +#define __LIBCUDACXX___ATOMIC_API_REFERENCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_ref_common +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_ref_common(_Tp& __v) + : __a(&__v) + {} + + __atomic_ref_storage<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(const, ) +}; + +template +struct __atomic_ref_arithmetic +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_ref_arithmetic(_Tp& __v) + : __a(&__v) + {} + + __atomic_ref_storage<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(const, ) + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(const, ) +}; + +template +struct __atomic_ref_bitwise +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_ref_bitwise(_Tp& __v) + : __a(&__v) + {} + + __atomic_ref_storage<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(const, ) + _LIBCUDACXX_ATOMIC_ARITHMETIC_IMPL(const, ) + _LIBCUDACXX_ATOMIC_BITWISE_IMPL(const, ) +}; + +template +struct __atomic_ref_pointer +{ + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_ref_pointer(_Tp& __v) + : __a(&__v) + {} + + __atomic_ref_storage<_Tp> __a; + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) 
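// [Editorial sketch, not part of the header] The reference-based structs above
// instantiate the shared API macros with _CONST = const: the wrapper object may
// be const-qualified, yet operations still modify the referenced object,
// matching std::atomic_ref semantics. Assuming the public cuda::std::atomic_ref
// builds on these bases, usage looks roughly like this (__bump is a
// hypothetical helper):
#include <cuda/std/atomic>

__host__ __device__ int __bump(int& __counter)
{
  const cuda::std::atomic_ref<int> __ref{__counter};
  return __ref.fetch_add(1, cuda::std::memory_order_relaxed); // mutates __counter
}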
+ static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + + _LIBCUDACXX_ATOMIC_COMMON_IMPL(const, ) + _LIBCUDACXX_ATOMIC_POINTER_IMPL(const, ) +}; + +template +using __atomic_ref_impl = + _If::value, + __atomic_ref_pointer<_Tp, __scope_to_tag<_Sco>>, + _If::value, + __atomic_ref_arithmetic<_Tp, __scope_to_tag<_Sco>>, + _If::value, + __atomic_ref_bitwise<_Tp, __scope_to_tag<_Sco>>, + __atomic_ref_common<_Tp, __scope_to_tag<_Sco>>>>>; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_API_REFERENCE_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions.h b/libcudacxx/include/cuda/std/__atomic/functions.h new file mode 100644 index 0000000000..76cea325ce --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_FUNCTIONS_H +#define __LIBCUDACXX___ATOMIC_FUNCTIONS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +// Device atomics +#include +#include + +// Host atomics +#include + +#endif // __LIBCUDACXX___ATOMIC_FUNCTIONS_H diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h new file mode 100644 index 0000000000..13f534b905 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h @@ -0,0 +1,203 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
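// [Editorial note, not part of the diff] __atomic_impl and __atomic_ref_impl
// above select the API surface by type. The exact _If conditions are elided in
// this diff; the split assumed here is: pointers get the pointer ops, integral
// and floating-point types get the arithmetic ops, integral types additionally
// get the bitwise ops, and everything else falls back to the common
// load/store/exchange/CAS set. In user terms (illustrative kernel only):
#include <cuda/std/atomic>

__global__ void __api_surface_examples(cuda::std::atomic<int>* __i,
                                       cuda::std::atomic<float>* __f,
                                       cuda::std::atomic<int*>* __p)
{
  __i->fetch_add(1);    // arithmetic set
  __i->fetch_or(0x2);   // bitwise set (integral types only)
  __f->fetch_add(1.0f); // arithmetic set; no fetch_or for floating point
  __p->fetch_add(1);    // pointer set: advances by one element (sizeof(int) bytes)
}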
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_FUNCTIONS_DERIVED_H +#define __LIBCUDACXX___ATOMIC_FUNCTIONS_DERIVED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_CCCL_CUDA_COMPILER) + +template = 0> +_CCCL_DEVICE bool __atomic_compare_exchange_cuda( + _Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, _Sco) +{ + auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); + auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; + auto const __mask = ((1 << sizeof(_Tp) * 8) - 1) << __offset; + + uint32_t __old = *__expected << __offset; + uint32_t __old_value; + while (1) + { + __old_value = (__old & __mask) >> __offset; + if (__old_value != *__expected) + { + break; + } + uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); + if (__atomic_compare_exchange_cuda( + __aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, _Sco{})) + { + return true; + } + } + *__expected = __old_value; + return false; +} + +template = 0> +_CCCL_DEVICE void __atomic_exchange_cuda(_Tp volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, _Sco{})) + ; + *__ret = __expected; +} + +template = 0> +_CCCL_DEVICE _Tp __atomic_fetch_add_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected + __val; + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected + __val; + } + return __expected; +} + +template ::value, int> = 0> +_CCCL_DEVICE _Tp __atomic_fetch_max_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected > __val ? __expected : __val; + + while (__desired == __val + && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected > __val ? __expected : __val; + } + + return __expected; +} + +template ::value, int> = 0> +_CCCL_DEVICE _Tp __atomic_fetch_min_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected < __val ? __expected : __val; + + while (__desired == __val + && !__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected < __val ? 
__expected : __val; + } + + return __expected; +} + +template = 0> +_CCCL_DEVICE _Tp __atomic_fetch_sub_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected - __val; + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected - __val; + } + return __expected; +} + +template = 0> +_CCCL_DEVICE _Tp __atomic_fetch_and_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected & __val; + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected & __val; + } + return __expected; +} + +template = 0> +_CCCL_DEVICE _Tp __atomic_fetch_xor_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected ^ __val; + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected ^ __val; + } + return __expected; +} + +template = 0> +_CCCL_DEVICE _Tp __atomic_fetch_or_cuda(_Tp volatile* __ptr, _Up __val, int __memorder, _Sco) +{ + _Tp __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, _Sco{}); + _Tp __desired = __expected | __val; + while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __desired, true, __memorder, __memorder, _Sco{})) + { + __desired = __expected | __val; + } + return __expected; +} + +template +_CCCL_DEVICE _Tp __atomic_load_n_cuda(const _Tp volatile* __ptr, int __memorder, _Sco) +{ + _Tp __ret; + __atomic_load_cuda(__ptr, &__ret, __memorder, _Sco{}); + return __ret; +} + +template +_CCCL_DEVICE void __atomic_store_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +{ + __atomic_store_cuda(__ptr, &__val, __memorder, _Sco{}); +} + +template +_CCCL_DEVICE bool __atomic_compare_exchange_n_cuda( + _Tp volatile* __ptr, _Tp* __expected, _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco) +{ + return __atomic_compare_exchange_cuda( + __ptr, __expected, __desired, __weak, __success_memorder, __failure_memorder, _Sco{}); +} + +template +_CCCL_DEVICE _Tp __atomic_exchange_n_cuda(_Tp volatile* __ptr, _Tp __val, int __memorder, _Sco) +{ + _Tp __ret; + __atomic_exchange_cuda(__ptr, &__val, &__ret, __memorder, _Sco{}); + return __ret; +} + +_CCCL_DEVICE static inline void __atomic_signal_fence_cuda(int) +{ + asm volatile("" ::: "memory"); +} + +#endif // defined(_CCCL_CUDA_COMPILER) + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_FUNCTIONS_DERIVED_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h similarity index 99% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h rename to libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h index 648de27352..6d3ad940f3 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h @@ -8,8 +8,36 @@ // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains 
exactly the contentes we want to generate +// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H +#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_CCCL_CUDA_COMPILER) + static inline _CCCL_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } @@ -249,9 +277,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -283,9 +311,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1156,9 +1184,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, 
int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1190,9 +1218,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2426,9 +2454,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2460,9 +2488,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3333,9 +3361,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> 
-_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3367,9 +3395,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4603,9 +4631,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4637,9 +4665,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5510,9 +5538,9 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { 
asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5544,9 +5572,9 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -6542,4 +6570,10 @@ _CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int return __ret; } +#endif // defined(_CCCL_CUDA_COMPILER) + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H + // clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/functions/host.h b/libcudacxx/include/cuda/std/__atomic/functions/host.h new file mode 100644 index 0000000000..59dc6bd093 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/functions/host.h @@ -0,0 +1,250 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMICS_FUNCTIONS_HOST_H +#define _LIBCUDACXX___ATOMICS_FUNCTIONS_HOST_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Watomic-alignment") + +#if !defined(_CCCL_COMPILER_NVRTC) + +template +struct __atomic_alignment_wrapper +{ + _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __atom; +}; + +template +__atomic_alignment_wrapper<__remove_cv_t<_Tp>>& __atomic_auto_align(_Tp* __a) +{ + using __aligned_t = __atomic_alignment_wrapper<__remove_cv_t<_Tp>>; + return *reinterpret_cast<__aligned_t*>(__a); +}; +template +const __atomic_alignment_wrapper<__remove_cv_t<_Tp>>& __atomic_auto_align(const _Tp* __a) +{ + using __aligned_t = const __atomic_alignment_wrapper<__remove_cv_t<_Tp>>; + return *reinterpret_cast<__aligned_t*>(__a); +}; +template +volatile __atomic_alignment_wrapper<__remove_cv_t<_Tp>>& __atomic_auto_align(volatile _Tp* __a) +{ + using __aligned_t = volatile __atomic_alignment_wrapper<__remove_cv_t<_Tp>>; + return *reinterpret_cast<__aligned_t*>(__a); +}; +template +const volatile __atomic_alignment_wrapper<__remove_cv_t<_Tp>>& __atomic_auto_align(const volatile _Tp* __a) +{ + using __aligned_t = const volatile __atomic_alignment_wrapper<__remove_cv_t<_Tp>>; + return *reinterpret_cast<__aligned_t*>(__a); +}; + +// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) +inline void __atomic_thread_fence_host(memory_order __order) +{ + __atomic_thread_fence(__atomic_order_to_int(__order)); +} + +inline void __atomic_signal_fence_host(memory_order __order) +{ + __atomic_signal_fence(__atomic_order_to_int(__order)); +} + +template +inline void __atomic_store_host(_Tp* __a, _Up __val, memory_order __order) +{ + __atomic_store( + &__atomic_auto_align<_Tp>(__a), &__atomic_auto_align<__remove_cv_t<_Tp>>(&__val), __atomic_order_to_int(__order)); +} + +template +inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> __remove_cv_t<_Tp> +{ + __remove_cv_t<_Tp> __ret; + __atomic_load( + &__atomic_auto_align<_Tp>(__a), &__atomic_auto_align<__remove_cv_t<_Tp>>(&__ret), __atomic_order_to_int(__order)); + return __ret; +} + +template +inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> __remove_cv_t<_Tp> +{ + __remove_cv_t<_Tp> __ret; + __atomic_exchange(&__atomic_auto_align<_Tp>(__a), + &__atomic_auto_align<__remove_cv_t<_Tp>>(&__val), + &__atomic_auto_align<__remove_cv_t<_Tp>>(&__ret), + __atomic_order_to_int(__order)); + return __ret; +} + +template +inline bool __atomic_compare_exchange_strong_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) +{ + return __atomic_compare_exchange( + &__atomic_auto_align<_Tp>(__a), + &__atomic_auto_align<__remove_cv_t<_Tp>>(__expected), + &__atomic_auto_align<__remove_cv_t<_Tp>>(&__value), + false, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +inline bool __atomic_compare_exchange_weak_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) +{ + return __atomic_compare_exchange( + &__atomic_auto_align<_Tp>(__a), + 
&__atomic_auto_align<__remove_cv_t<_Tp>>(__expected), + &__atomic_auto_align<__remove_cv_t<_Tp>>(&__value), + true, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +struct __atomic_ptr_skip +{ + static constexpr auto __skip = 1; +}; + +template +struct __atomic_ptr_skip<_Tp*> +{ + static constexpr auto __skip = sizeof(_Tp); +}; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. +template +struct __atomic_ptr_skip<_Tp[]> +{}; +template +struct __atomic_ptr_skip<_Tp[n]> +{}; + +template +using __atomic_ptr_skip_t = __atomic_ptr_skip<__remove_cvref_t<_Tp>>; + +template ::value, int> = 0> +inline __remove_cv_t<_Tp> __atomic_fetch_add_host(_Tp* __a, _Td __delta, memory_order __order) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Tp>::__skip; + return __atomic_fetch_add(__a, __delta * __skip_v, __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline __remove_cv_t<_Tp> __atomic_fetch_add_host(_Tp* __a, _Td __delta, memory_order __order) +{ + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected + __delta; + + while (!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) + { + __desired = __expected + __delta; + } + + return __expected; +} + +template ::value, int> = 0> +inline __remove_cv_t<_Tp> __atomic_fetch_sub_host(_Tp* __a, _Td __delta, memory_order __order) +{ + constexpr auto __skip_v = __atomic_ptr_skip_t<_Tp>::__skip; + return __atomic_fetch_sub(__a, __delta * __skip_v, __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline __remove_cv_t<_Tp> __atomic_fetch_sub_host(_Tp* __a, _Td __delta, memory_order __order) +{ + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected - __delta; + + while (!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) + { + __desired = __expected - __delta; + } + + return __expected; +} + +template +inline __remove_cv_t<_Tp> __atomic_fetch_and_host(_Tp* __a, _Td __pattern, memory_order __order) +{ + return __atomic_fetch_and(__a, __pattern, __atomic_order_to_int(__order)); +} + +template +inline __remove_cv_t<_Tp> __atomic_fetch_or_host(_Tp* __a, _Td __pattern, memory_order __order) +{ + return __atomic_fetch_or(__a, __pattern, __atomic_order_to_int(__order)); +} + +template +inline __remove_cv_t<_Tp> __atomic_fetch_xor_host(_Tp* __a, _Td __pattern, memory_order __order) +{ + return __atomic_fetch_xor(__a, __pattern, __atomic_order_to_int(__order)); +} + +template +inline __remove_cv_t<_Tp> __atomic_fetch_max_host(_Tp* __a, _Td __val, memory_order __order) +{ + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected > __val ? __expected : __val; + + while (__desired == __val && !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) + { + __desired = __expected > __val ? __expected : __val; + } + + return __expected; +} + +template +inline __remove_cv_t<_Tp> __atomic_fetch_min_host(_Tp* __a, _Td __val, memory_order __order) +{ + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected < __val ? __expected : __val; + + while (__desired == __val && !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) + { + __desired = __expected < __val ? 
__expected : __val; + } + + return __expected; +} + +#endif // !defined(_CCCL_COMPILER_NVRTC) + +_CCCL_DIAG_POP + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMICS_FUNCTIONS_HOST_H diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h new file mode 100644 index 0000000000..935efab757 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -0,0 +1,156 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_ORDER_H +#define __LIBCUDACXX___ATOMIC_ORDER_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING( \ + __m == memory_order_consume || __m == memory_order_acquire || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#ifndef __ATOMIC_RELAXED +# define __ATOMIC_RELAXED 0 +# define __ATOMIC_CONSUME 1 +# define __ATOMIC_ACQUIRE 2 +# define __ATOMIC_RELEASE 3 +# define __ATOMIC_ACQ_REL 4 +# define __ATOMIC_SEQ_CST 5 +#endif //__ATOMIC_RELAXED + +// Figure out what the underlying type for `memory_order` would be if it were +// declared as an unscoped enum (accounting for -fshort-enums). Use this result +// to pin the underlying type in C++20. 
+enum __legacy_memory_order +{ + __mo_relaxed, + __mo_consume, + __mo_acquire, + __mo_release, + __mo_acq_rel, + __mo_seq_cst +}; + +using __memory_order_underlying_t = underlying_type<__legacy_memory_order>::type; + +#if _CCCL_STD_VER >= 2020 + +enum class memory_order : __memory_order_underlying_t +{ + relaxed = __mo_relaxed, + consume = __mo_consume, + acquire = __mo_acquire, + release = __mo_release, + acq_rel = __mo_acq_rel, + seq_cst = __mo_seq_cst +}; + +inline constexpr auto memory_order_relaxed = memory_order::relaxed; +inline constexpr auto memory_order_consume = memory_order::consume; +inline constexpr auto memory_order_acquire = memory_order::acquire; +inline constexpr auto memory_order_release = memory_order::release; +inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; +inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; + +#else // ^^^ C++20 ^^^ / vvv C++17 vvv + +typedef enum memory_order +{ + memory_order_relaxed = __mo_relaxed, + memory_order_consume = __mo_consume, + memory_order_acquire = __mo_acquire, + memory_order_release = __mo_release, + memory_order_acq_rel = __mo_acq_rel, + memory_order_seq_cst = __mo_seq_cst, +} memory_order; + +#endif // _CCCL_STD_VER >= 2020 + +_CCCL_HOST_DEVICE inline int __stronger_order_cuda(int __a, int __b) +{ + int const __max = __a > __b ? __a : __b; + if (__max != __ATOMIC_RELEASE) + { + return __max; + } + constexpr int __xform[] = {__ATOMIC_RELEASE, __ATOMIC_ACQ_REL, __ATOMIC_ACQ_REL, __ATOMIC_RELEASE}; + return __xform[__a < __b ? __a : __b]; +} + +_CCCL_HOST_DEVICE inline constexpr int __atomic_order_to_int(memory_order __order) +{ + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed + ? __ATOMIC_RELAXED + : (__order == memory_order_acquire + ? __ATOMIC_ACQUIRE + : (__order == memory_order_release + ? __ATOMIC_RELEASE + : (__order == memory_order_seq_cst + ? __ATOMIC_SEQ_CST + : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); +} + +_CCCL_HOST_DEVICE inline constexpr int __atomic_failure_order_to_int(memory_order __order) +{ + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed + ? __ATOMIC_RELAXED + : (__order == memory_order_acquire + ? __ATOMIC_ACQUIRE + : (__order == memory_order_release + ? __ATOMIC_RELAXED + : (__order == memory_order_seq_cst + ? __ATOMIC_SEQ_CST + : (__order == memory_order_acq_rel ? 
__ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); +} + +static_assert((is_same::type, __memory_order_underlying_t>::value), + "unexpected underlying type for std::memory_order"); + +_LIBCUDACXX_END_NAMESPACE_STD + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +using memory_order = _CUDA_VSTD::memory_order; + +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_relaxed = _CUDA_VSTD::memory_order_relaxed; +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_consume = _CUDA_VSTD::memory_order_consume; +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_acquire = _CUDA_VSTD::memory_order_acquire; +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_release = _CUDA_VSTD::memory_order_release; +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_acq_rel = _CUDA_VSTD::memory_order_acq_rel; +_LIBCUDACXX_INLINE_VAR constexpr memory_order memory_order_seq_cst = _CUDA_VSTD::memory_order_seq_cst; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // __LIBCUDACXX___ATOMIC_ORDER_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform.h b/libcudacxx/include/cuda/std/__atomic/platform.h new file mode 100644 index 0000000000..6367e20234 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/platform.h @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_PLATFORM_H +#define __LIBCUDACXX___ATOMIC_PLATFORM_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_COMPILER_MSVC) +# include +#endif + +#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) +# define LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +# define LIBCUDACXX_ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +# define LIBCUDACXX_ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +# define LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +# define LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE +#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE +# define LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +# define LIBCUDACXX_ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +# define 
LIBCUDACXX_ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +# define LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +# define LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +#else // !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_INT_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_LONG_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE 2 +# define LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE 2 +#endif + +#define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(size) (size <= 8) + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +template +struct __atomic_is_always_lock_free +{ + enum + { + __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) + }; +}; +#else +template +struct __atomic_is_always_lock_free +{ + enum + { + __value = sizeof(_Tp) <= 8 + }; +}; +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_PLATFORM_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h similarity index 92% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h rename to libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 53cd9cd4d7..8afa9756ef 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -1,4 +1,3 @@ -// -*- C++ -*- //===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, @@ -9,28 +8,42 @@ // //===----------------------------------------------------------------------===// -#ifndef _MSC_VER -# error "This file is only for CL.EXE's benefit" -#endif +#ifndef __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H +#define __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H -#define _LIBCUDACXX_COMPILER_BARRIER() _ReadWriteBarrier() +#include -#if defined(_M_ARM) || defined(_M_ARM64) -# define _LIBCUDACXX_MEMORY_BARRIER() __dmb(0xB) // inner shared data memory barrier -# define _LIBCUDACXX_COMPILER_OR_MEMORY_BARRIER() _LIBCUDACXX_MEMORY_BARRIER() -#elif defined(_M_IX86) || defined(_M_X64) -# define _LIBCUDACXX_MEMORY_BARRIER() __faststorefence() +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if defined(_CCCL_COMPILER_MSVC) + +# include +# include + +# include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +# define _LIBCUDACXX_COMPILER_BARRIER() _ReadWriteBarrier() + +# if defined(_M_ARM) || defined(_M_ARM64) +# define _LIBCUDACXX_MEMORY_BARRIER() __dmb(0xB) // inner shared data memory barrier +# define _LIBCUDACXX_COMPILER_OR_MEMORY_BARRIER() _LIBCUDACXX_MEMORY_BARRIER() +# elif defined(_M_IX86) || defined(_M_X64) +# define _LIBCUDACXX_MEMORY_BARRIER() __faststorefence() // x86/x64 hardware only emits memory barriers inside _Interlocked intrinsics -# define _LIBCUDACXX_COMPILER_OR_MEMORY_BARRIER() 
_LIBCUDACXX_COMPILER_BARRIER() -#else // ^^^ x86/x64 / unsupported hardware vvv -# error Unsupported hardware -#endif // hardware +# define _LIBCUDACXX_COMPILER_OR_MEMORY_BARRIER() _LIBCUDACXX_COMPILER_BARRIER() +# else // ^^^ x86/x64 / unsupported hardware vvv +# error Unsupported hardware +# endif // hardware // MSVC Does not have compiler intrinsics for lock-free checking -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) (__x <= 8) -#endif - inline int __stronger_order_msvc(int __a, int __b) { int const __max = __a > __b ? __a : __b; @@ -64,41 +77,41 @@ using _enable_if_sized_as = typename enable_if::typ template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int8 __tmp = *(const volatile __int8*) __ptr; -#else +# else __int8 __tmp = __iso_volatile_load8((const volatile __int8*) __ptr); -#endif +# endif *__ret = reinterpret_cast<_Type&>(__tmp); } template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int16 __tmp = *(const volatile __int16*) __ptr; -#else +# else __int16 __tmp = __iso_volatile_load16((const volatile __int16*) __ptr); -#endif +# endif *__ret = reinterpret_cast<_Type&>(__tmp); } template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int32 __tmp = *(const volatile __int32*) __ptr; -#else +# else __int32 __tmp = __iso_volatile_load32((const volatile __int32*) __ptr); -#endif +# endif *__ret = reinterpret_cast<_Type&>(__tmp); } template = 0> void __atomic_load_relaxed(const volatile _Type* __ptr, _Type* __ret) { -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN __int64 __tmp = *(const volatile __int64*) __ptr; -#else +# else __int64 __tmp = __iso_volatile_load64((const volatile __int64*) __ptr); -#endif +# endif *__ret = reinterpret_cast<_Type&>(__tmp); } @@ -128,45 +141,45 @@ void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int8*>(__val); auto __d = reinterpret_cast(__ptr); -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN (void) _InterlockedExchange8(__d, *__t); -#else +# else __iso_volatile_store8(__d, *__t); -#endif +# endif } template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int16*>(__val); auto __d = reinterpret_cast(__ptr); -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN (void) _InterlockedExchange16(__d, *__t); -#else +# else __iso_volatile_store16(__d, *__t); -#endif +# endif } template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int32*>(__val); auto __d = reinterpret_cast(__ptr); -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN // int cannot be converted to long?... 
(void) _InterlockedExchange(reinterpret_cast(__d), *__t); -#else +# else __iso_volatile_store32(__d, *__t); -#endif +# endif } template = 0> void __atomic_store_relaxed(volatile _Type* __ptr, _Type* __val) { auto __t = reinterpret_cast<__int64*>(__val); auto __d = reinterpret_cast(__ptr); -#ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# ifdef _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN (void) _InterlockedExchange64(__d, *__t); -#else +# else __iso_volatile_store64(__d, *__t); -#endif +# endif } template @@ -622,4 +635,8 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) return __expected; } -#include +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // defined(_CCCL_COMPILER_MSVC) + +#endif // __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H diff --git a/libcudacxx/include/cuda/std/__atomic/scopes.h b/libcudacxx/include/cuda/std/__atomic/scopes.h new file mode 100644 index 0000000000..70af777d5c --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/scopes.h @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_SCOPES_H +#define __LIBCUDACXX___ATOMIC_SCOPES_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// REMEMBER CHANGES TO THESE ARE ABI BREAKING +// TODO: Space values out for potential new scopes +#ifndef __ATOMIC_BLOCK +# define __ATOMIC_SYSTEM 0 // 0 indicates default +# define __ATOMIC_DEVICE 1 +# define __ATOMIC_BLOCK 2 +# define __ATOMIC_THREAD 10 +#endif //__ATOMIC_BLOCK + +enum thread_scope +{ + thread_scope_system = __ATOMIC_SYSTEM, + thread_scope_device = __ATOMIC_DEVICE, + thread_scope_block = __ATOMIC_BLOCK, + thread_scope_thread = __ATOMIC_THREAD +}; + +struct __thread_scope_thread_tag +{}; +struct __thread_scope_block_tag +{}; +struct __thread_scope_device_tag +{}; +struct __thread_scope_system_tag +{}; + +template +struct __scope_enum_to_tag +{}; +/* This would be the implementation once an actual thread-scope backend exists. 
+template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_thread_tag; }; +Until then: */ +template <> +struct __scope_enum_to_tag<(int) thread_scope_thread> +{ + using __tag = __thread_scope_block_tag; +}; +template <> +struct __scope_enum_to_tag<(int) thread_scope_block> +{ + using __tag = __thread_scope_block_tag; +}; +template <> +struct __scope_enum_to_tag<(int) thread_scope_device> +{ + using __tag = __thread_scope_device_tag; +}; +template <> +struct __scope_enum_to_tag<(int) thread_scope_system> +{ + using __tag = __thread_scope_system_tag; +}; + +template +using __scope_to_tag = typename __scope_enum_to_tag<_Scope>::__tag; + +_LIBCUDACXX_END_NAMESPACE_STD + +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA + +using _CUDA_VSTD::thread_scope; +using _CUDA_VSTD::thread_scope_block; +using _CUDA_VSTD::thread_scope_device; +using _CUDA_VSTD::thread_scope_system; +using _CUDA_VSTD::thread_scope_thread; + +using _CUDA_VSTD::__thread_scope_block_tag; +using _CUDA_VSTD::__thread_scope_device_tag; +using _CUDA_VSTD::__thread_scope_system_tag; + +_LIBCUDACXX_END_NAMESPACE_CUDA + +#endif // __LIBCUDACXX___ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/__atomic/types.h b/libcudacxx/include/cuda/std/__atomic/types.h new file mode 100644 index 0000000000..4b58ba4901 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types.h @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_TYPES_H +#define __LIBCUDACXX___ATOMIC_TYPES_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_traits +{ + static constexpr bool __atomic_requires_lock = !__atomic_is_always_lock_free<_Tp>::__value; + static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; + static constexpr bool __atomic_supports_reference = + __atomic_is_always_lock_free<_Tp>::__value && (sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8); +}; + +template +using __atomic_storage_t = + _If<__atomic_traits<_Tp>::__atomic_requires_small, + __atomic_small_storage<_Tp>, + _If<__atomic_traits<_Tp>::__atomic_requires_lock, __atomic_locked_storage<_Tp>, __atomic_storage<_Tp>>>; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_TYPES_H diff --git a/libcudacxx/include/cuda/std/__atomic/types/base.h b/libcudacxx/include/cuda/std/__atomic/types/base.h new file mode 100644 index 0000000000..ecee01eb15 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types/base.h @@ -0,0 +1,239 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_TYPES_BASE_H +#define _LIBCUDACXX___ATOMIC_TYPES_BASE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_storage +{ + using __underlying_t = _Tp; + static constexpr __atomic_tag __tag = __atomic_tag::__atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(_CCCL_TRAIT(is_trivially_copyable, _Tp), + "std::atomic requires that 'Tp' be a trivially copyable type"); +#endif + + _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + constexpr explicit __atomic_storage() noexcept = default; + + _CCCL_HOST_DEVICE constexpr explicit inline __atomic_storage(_Tp value) noexcept + : __a_value(value) + {} + + _CCCL_HOST_DEVICE inline auto get() noexcept -> __underlying_t* + { + return &__a_value; + } + _CCCL_HOST_DEVICE inline auto get() const noexcept -> const __underlying_t* + { + return &__a_value; + } + _CCCL_HOST_DEVICE inline auto get() volatile noexcept -> volatile __underlying_t* + { + return &__a_value; + } + _CCCL_HOST_DEVICE inline auto get() const volatile noexcept -> const volatile __underlying_t* + { + return &__a_value; + } +}; + +_CCCL_HOST_DEVICE inline void __atomic_thread_fence_dispatch(memory_order __order) +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), + NV_IS_HOST, + (__atomic_thread_fence_host(__order);)) +} + +_CCCL_HOST_DEVICE inline void __atomic_signal_fence_dispatch(memory_order __order) +{ + NV_DISPATCH_TARGET(NV_IS_DEVICE, + (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), + NV_IS_HOST, + (__atomic_signal_fence_host(__order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_init_dispatch(_Sto* __a, _Up __val) +{ + __atomic_assign_volatile(__a->get(), __val); +} + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (__atomic_store_n_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (__atomic_store_host(__a->get(), __val, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_load_n_cuda(__a->get(), static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_load_host(__a->get(), __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_exchange_n_cuda(__a->get(), __value, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_exchange_host(__a->get(), __value, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline bool 
__atomic_compare_exchange_strong_dispatch( + _Sto* __a, _Up* __expected, _Up __val, memory_order __success, memory_order __failure, _Sco = {}) +{ + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (__result = __atomic_compare_exchange_cuda( + __a->get(), + __expected, + __val, + false, + static_cast<__memory_order_underlying_t>(__success), + static_cast<__memory_order_underlying_t>(__failure), + _Sco{});), + NV_IS_HOST, + (__result = __atomic_compare_exchange_strong_host(__a->get(), __expected, __val, __success, __failure);)) + return __result; +} + +template = 0> +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch( + _Sto* __a, _Up* __expected, _Up __val, memory_order __success, memory_order __failure, _Sco = {}) +{ + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (__result = __atomic_compare_exchange_cuda( + __a->get(), + __expected, + __val, + true, + static_cast<__memory_order_underlying_t>(__success), + static_cast<__memory_order_underlying_t>(__failure), + _Sco{});), + NV_IS_HOST, + (__result = __atomic_compare_exchange_weak_host(__a->get(), __expected, __val, __success, __failure);)) + return __result; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_add_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_fetch_add_host(__a->get(), __delta, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_sub_cuda(__a->get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_fetch_sub_host(__a->get(), __delta, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_and_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_fetch_and_host(__a->get(), __pattern, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_or_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_fetch_or_host(__a->get(), __pattern, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_xor_cuda(__a->get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + NV_IS_HOST, + (return __atomic_fetch_xor_host(__a->get(), __pattern, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_max_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_IF_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_max_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + 
(return __atomic_fetch_max_host(__a->get(), __val, __order);)) +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_min_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + NV_IF_TARGET( + NV_IS_DEVICE, + (return __atomic_fetch_min_cuda(__a->get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{});), + (return __atomic_fetch_min_host(__a->get(), __val, __order);)) +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_TYPES_BASE_H diff --git a/libcudacxx/include/cuda/std/__atomic/types/common.h b/libcudacxx/include/cuda/std/__atomic/types/common.h new file mode 100644 index 0000000000..9a44fe7034 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types/common.h @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_TYPES_COMMON_H +#define _LIBCUDACXX___ATOMIC_TYPES_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +enum class __atomic_tag +{ + __atomic_base_tag, + __atomic_locked_tag, + __atomic_small_tag, +}; + +// Helpers to SFINAE on the tag inside the storage object +template +using __atomic_storage_is_base = __enable_if_t<__atomic_tag::__atomic_base_tag == __remove_cvref_t<_Sto>::__tag, int>; +template +using __atomic_storage_is_locked = + __enable_if_t<__atomic_tag::__atomic_locked_tag == __remove_cvref_t<_Sto>::__tag, int>; +template +using __atomic_storage_is_small = __enable_if_t<__atomic_tag::__atomic_small_tag == __remove_cvref_t<_Sto>::__tag, int>; + +template +using __atomic_underlying_t = typename _Tp::__underlying_t; +template +using __atomic_underlying_remove_cv_t = __remove_cv_t; + +// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because +// the default operator= in an object is not volatile, a byte-by-byte copy +// is required. 
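+// For example, with a class-type _Tp the implicitly-declared operator= is not
+// volatile-qualified, so `*__a_value = __val;` would not compile for a
+// volatile destination; the volatile overload below therefore copies the
+// object one char at a time through volatile char pointers.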
+template +_CCCL_HOST_DEVICE __enable_if_t<_CCCL_TRAIT(is_assignable, _Tp&, _Tv)> +__atomic_assign_volatile(_Tp* __a_value, _Tv const& __val) +{ + *__a_value = __val; +} + +template +_CCCL_HOST_DEVICE __enable_if_t<_CCCL_TRAIT(is_assignable, _Tp&, _Tv)> +__atomic_assign_volatile(_Tp volatile* __a_value, _Tv volatile const& __val) +{ + volatile char* __to = reinterpret_cast(__a_value); + volatile char* __end = __to + sizeof(_Tp); + volatile const char* __from = reinterpret_cast(&__val); + while (__to != __end) + { + *__to++ = *__from++; + } +} + +_CCCL_HOST_DEVICE inline int __atomic_memcmp(void const* __lhs, void const* __rhs, size_t __count) +{ + NV_DISPATCH_TARGET( + NV_IS_DEVICE, + (auto __lhs_c = reinterpret_cast(__lhs); + auto __rhs_c = reinterpret_cast(__rhs); + while (__count--) { + auto const __lhs_v = *__lhs_c++; + auto const __rhs_v = *__rhs_c++; + if (__lhs_v < __rhs_v) + { + return -1; + } + if (__lhs_v > __rhs_v) + { + return 1; + } + } return 0;), + NV_IS_HOST, + (return memcmp(__lhs, __rhs, __count);)) +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_TYPES_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/types/locked.h b/libcudacxx/include/cuda/std/__atomic/types/locked.h new file mode 100644 index 0000000000..1fc5103d2a --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types/locked.h @@ -0,0 +1,221 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_TYPES_LOCKED_H +#define _LIBCUDACXX___ATOMIC_TYPES_LOCKED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Locked atomics must override the dispatch to be able to implement RMW primitives around the embedded lock. 
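+// Each dispatch overload for this storage follows the same pattern: acquire
+// __a_lock, perform the plain read/modify/write via __atomic_assign_volatile,
+// then release the lock. The caller's memory_order argument is not forwarded;
+// ordering comes from the acquire/release semantics of the lock itself.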
+template +struct __atomic_locked_storage +{ + using __underlying_t = _Tp; + static constexpr __atomic_tag __tag = __atomic_tag::__atomic_locked_tag; + + _Tp __a_value; + mutable __atomic_storage<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a_lock; + + explicit constexpr __atomic_locked_storage() noexcept = default; + + _CCCL_HOST_DEVICE constexpr explicit inline __atomic_locked_storage(_Tp value) noexcept + : __a_value(value) + , __a_lock{} + {} + + template + _CCCL_HOST_DEVICE inline void __lock(_Sco) const volatile noexcept + { + while (1 == __atomic_exchange_dispatch(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _CCCL_HOST_DEVICE inline void __lock(_Sco) const noexcept + { + while (1 == __atomic_exchange_dispatch(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _CCCL_HOST_DEVICE inline void __unlock(_Sco) const volatile noexcept + { + __atomic_store_dispatch(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } + template + _CCCL_HOST_DEVICE inline void __unlock(_Sco) const noexcept + { + __atomic_store_dispatch(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } +}; + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_init_dispatch(_Sto* __a, _Up __val) +{ + __atomic_assign_volatile(&__a->__a_value, __val); +} + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order, _Sco = {}) +{ + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__a->__a_value, __val); + __a->__unlock(_Sco{}); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, __value); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch( + _Sto* __a, _Up* __expected, _Up __value, memory_order, memory_order, _Sco = {}) +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __temp; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__temp, __a->__a_value); + bool __ret = __temp == *__expected; + if (__ret) + { + __atomic_assign_volatile(&__a->__a_value, __value); + } + else + { + __atomic_assign_volatile(__expected, __a->__a_value); + } + __a->__unlock(_Sco{}); + return __ret; +} + +template = 0> +_CCCL_HOST_DEVICE inline bool +__atomic_compare_exchange_weak_dispatch(_Sto* __a, _Up* __expected, _Up __value, memory_order, memory_order, _Sco = {}) +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __temp; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__temp, __a->__a_value); + bool __ret = __temp == *__expected; + if (__ret) + { + __atomic_assign_volatile(&__a->__a_value, __value); + } + else + { + __atomic_assign_volatile(__expected, __a->__a_value); + } + __a->__unlock(_Sco{}); + return __ret; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order, _Sco = {}) + -> 
__atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, _Tp(__old + __delta)); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, _Tp(__old - __delta)); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, _Tp(__old & __pattern)); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, _Tp(__old | __pattern)); + __a->__unlock(_Sco{}); + return __old; +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + _Tp __old; + __a->__lock(_Sco{}); + __atomic_assign_volatile(&__old, __a->__a_value); + __atomic_assign_volatile(&__a->__a_value, _Tp(__old ^ __pattern)); + __a->__unlock(_Sco{}); + return __old; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_TYPES_LOCKED_H diff --git a/libcudacxx/include/cuda/std/__atomic/types/reference.h b/libcudacxx/include/cuda/std/__atomic/types/reference.h new file mode 100644 index 0000000000..a83c8e5832 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types/reference.h @@ -0,0 +1,69 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_TYPES_REFERENCE_H +#define _LIBCUDACXX___ATOMIC_TYPES_REFERENCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Reference is compatible with __atomic_base_tag and uses the default dispatch +template +struct __atomic_ref_storage +{ + using __underlying_t = _Tp; + static constexpr __atomic_tag __tag = __atomic_tag::__atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(_CCCL_TRAIT(is_trivially_copyable, _Tp), + "std::atomic_ref requires that 'Tp' be a trivially copyable type"); +#endif + + _Tp* __a_value; + + __atomic_ref_storage() = delete; + + _CCCL_HOST_DEVICE constexpr explicit inline __atomic_ref_storage(_Tp* value) noexcept + : __a_value(value) + {} + + _CCCL_HOST_DEVICE inline auto get() noexcept -> __underlying_t* + { + return __a_value; + } + _CCCL_HOST_DEVICE inline auto get() const noexcept -> __underlying_t* + { + return __a_value; + } + _CCCL_HOST_DEVICE inline auto get() volatile noexcept -> volatile __underlying_t* + { + return __a_value; + } + _CCCL_HOST_DEVICE inline auto get() const volatile noexcept -> volatile __underlying_t* + { + return __a_value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_TYPES_REFERENCE_H diff --git a/libcudacxx/include/cuda/std/__atomic/types/small.h b/libcudacxx/include/cuda/std/__atomic/types/small.h new file mode 100644 index 0000000000..8f38df5bb0 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/types/small.h @@ -0,0 +1,222 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_TYPES_SMALL_H +#define _LIBCUDACXX___ATOMIC_TYPES_SMALL_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// manipulated by PTX without any performance overhead +template +using __atomic_small_proxy_t = _If<_CCCL_TRAIT(is_signed, _Tp), int32_t, uint32_t>; + +// Arithmetic conversions to/from proxy types +template = 0> +_CCCL_HOST_DEVICE constexpr __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) +{ + return static_cast<__atomic_small_proxy_t<_Tp>>(__val); +} + +template = 0> +_CCCL_HOST_DEVICE constexpr inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) +{ + return static_cast<_Tp>(__val); +} + +// Non-arithmetic conversion to/from proxy types +template = 0> +_CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) +{ + __atomic_small_proxy_t<_Tp> __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template = 0> +_CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) +{ + _Tp __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template +struct __atomic_small_storage +{ + using __underlying_t = _Tp; + using __proxy_t = __atomic_small_proxy_t<_Tp>; + static constexpr __atomic_tag __tag = __atomic_tag::__atomic_small_tag; + + _CCCL_HOST_DEVICE constexpr inline explicit __atomic_small_storage() noexcept + : __a_value{__proxy_t{}} {}; + + _CCCL_HOST_DEVICE constexpr inline explicit __atomic_small_storage(_Tp __value) noexcept + : __a_value{__atomic_small_to_32(__value)} + {} + + __atomic_storage<__proxy_t> __a_value; +}; + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_init_dispatch(_Sto* __a, _Up __val) +{ + __atomic_init_dispatch(&__a->__a_value, __atomic_small_to_32(__val)); +} + +template = 0> +_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) +{ + __atomic_store_dispatch(&__a->__a_value, __atomic_small_to_32(__val), __order, _Sco{}); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_load_dispatch(const _Sto* __a, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>(__atomic_load_dispatch(&__a->__a_value, __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_exchange_dispatch(_Sto* __a, _Up __value, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_exchange_dispatch(&__a->__a_value, __atomic_small_to_32(__value), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch( + _Sto* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure, _Sco = {}) +{ + using _Tp = __atomic_underlying_t<_Sto>; + auto __temp_expected = __atomic_small_to_32(*__expected); + auto const __ret = __atomic_compare_exchange_weak_dispatch( + &__a->__a_value, &__temp_expected, __atomic_small_to_32(__value), __success, __failure, _Sco{}); + auto const __actual = 
__atomic_small_from_32<_Tp>(__temp_expected); + constexpr auto __mask = static_cast((1u << (8 * sizeof(_Tp))) - 1); + if (!__ret) + { + if (0 == __atomic_memcmp(&__actual, __expected, sizeof(_Tp))) + { + __atomic_fetch_and_dispatch(&__a->__a_value, __mask, memory_order_relaxed, _Sco{}); + } + else + { + *__expected = __actual; + } + } + return __ret; +} + +template = 0> +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch( + _Sto* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure, _Sco = {}) +{ + using _Tp = __atomic_underlying_t<_Sto>; + auto const __old = *__expected; + while (1) + { + if (__atomic_compare_exchange_weak_dispatch(__a, __expected, __value, __success, __failure, _Sco{})) + { + return true; + } + if (0 != __atomic_memcmp(&__old, __expected, sizeof(_Tp))) + { + return false; + } + } +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_add_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_add_dispatch(&__a->__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_sub_dispatch(_Sto* __a, _Up __delta, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_sub_dispatch(&__a->__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_and_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_and_dispatch(&__a->__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_or_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_or_dispatch(&__a->__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_xor_dispatch(_Sto* __a, _Up __pattern, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_xor_dispatch(&__a->__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_max_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_max_dispatch(&__a->__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +template = 0> +_CCCL_HOST_DEVICE inline auto __atomic_fetch_min_dispatch(_Sto* __a, _Up __val, memory_order __order, _Sco = {}) + -> __atomic_underlying_t<_Sto> +{ + using _Tp = __atomic_underlying_t<_Sto>; + return __atomic_small_from_32<_Tp>( + __atomic_fetch_min_dispatch(&__a->__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_TYPES_SMALL_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h new file mode 100644 index 
0000000000..29130ee244 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H +#define _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__(); + +template +_LIBCUDACXX_INLINE_VISIBILITY void +__atomic_try_wait_slow(_Tp const volatile* __a, __atomic_underlying_remove_cv_t<_Tp> __val, memory_order __order, _Sco) +{ + NV_DISPATCH_TARGET(NV_PROVIDES_SM_70, __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{}); + , NV_IS_HOST, __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{}); + , NV_ANY_TARGET, __atomic_try_wait_unsupported_before_SM_70__();); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_one(_Tp const volatile*, _Sco) +{ + NV_DISPATCH_TARGET(NV_PROVIDES_SM_70, , NV_IS_HOST, , NV_ANY_TARGET, __atomic_try_wait_unsupported_before_SM_70__();); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_all(_Tp const volatile*, _Sco) +{ + NV_DISPATCH_TARGET(NV_PROVIDES_SM_70, , NV_IS_HOST, , NV_ANY_TARGET, __atomic_try_wait_unsupported_before_SM_70__();); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) +{ +#if defined(_CCCL_CUDA_COMPILER) + return __lhs == __rhs; +#else + return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; +#endif +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_wait( + _Tp const volatile* __a, __atomic_underlying_remove_cv_t<_Tp> const __val, memory_order __order, _Sco = {}) +{ + for (int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) + { + if (!__nonatomic_compare_equal(__atomic_load_dispatch(__a, __order, _Sco{}), __val)) + { + return; + } + if (__i < 12) + { + __libcpp_thread_yield_processor(); + } + else + { + __libcpp_thread_yield(); + } + } + while (__nonatomic_compare_equal(__atomic_load_dispatch(__a, __order, _Sco{}), __val)) + { + __atomic_try_wait_slow(__a, __val, __order, _Sco{}); + } +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h new file mode 100644 index 0000000000..8fe5f24b6d --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_POLLING_H +#define _LIBCUDACXX___ATOMIC_WAIT_POLLING_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_poll_tester +{ + using __underlying_t = __atomic_underlying_remove_cv_t<_Tp>; + + _Tp const volatile* __atom; + __underlying_t __val; + memory_order __order; + + _CCCL_HOST_DEVICE __atomic_poll_tester(_Tp const volatile* __a, __underlying_t __v, memory_order __o) + : __atom(__a) + , __val(__v) + , __order(__o) + {} + + _CCCL_HOST_DEVICE bool operator()() const + { + return !(__atomic_load_dispatch(__atom, __order, _Sco{}) == __val); + } +}; + +template +_CCCL_HOST_DEVICE void __atomic_try_wait_slow_fallback( + _Tp const volatile* __a, __atomic_underlying_remove_cv_t<_Tp> __val, memory_order __order, _Sco) +{ + __libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_WAIT_POLLING_H diff --git a/libcudacxx/include/cuda/std/__cuda/atomic.h b/libcudacxx/include/cuda/std/__cuda/atomic.h index c75d0c54da..d45a12c155 100644 --- a/libcudacxx/include/cuda/std/__cuda/atomic.h +++ b/libcudacxx/include/cuda/std/__cuda/atomic.h @@ -21,231 +21,90 @@ # pragma system_header #endif // no system header -_LIBCUDACXX_BEGIN_NAMESPACE_CUDA - -using std::__detail::thread_scope; -using std::__detail::thread_scope_block; -using std::__detail::thread_scope_device; -using std::__detail::thread_scope_system; -using std::__detail::thread_scope_thread; - -namespace __detail -{ -using std::__detail::__thread_scope_block_tag; -using std::__detail::__thread_scope_device_tag; -using std::__detail::__thread_scope_system_tag; -} // namespace __detail - -using memory_order = std::memory_order; +#include -constexpr memory_order memory_order_relaxed = std::memory_order_relaxed; -constexpr memory_order memory_order_consume = std::memory_order_consume; -constexpr memory_order memory_order_acquire = std::memory_order_acquire; -constexpr memory_order memory_order_release = std::memory_order_release; -constexpr memory_order memory_order_acq_rel = std::memory_order_acq_rel; -constexpr memory_order memory_order_seq_cst = std::memory_order_seq_cst; +_LIBCUDACXX_BEGIN_NAMESPACE_CUDA // atomic template -struct atomic : public std::__atomic_base<_Tp, _Sco> +struct atomic : public _CUDA_VSTD::__atomic_impl<_Tp, _Sco> { - typedef std::__atomic_base<_Tp, _Sco> __base; + using value_type = _Tp; constexpr atomic() noexcept = default; - _CCCL_HOST_DEVICE constexpr atomic(_Tp __d) noexcept - : __base(__d) - {} - - _CCCL_HOST_DEVICE _Tp operator=(_Tp __d) volatile noexcept - { - __base::store(__d); - return __d; - } - _CCCL_HOST_DEVICE _Tp operator=(_Tp __d) noexcept - { - __base::store(__d); - return __d; - } - - _CCCL_HOST_DEVICE _Tp fetch_max(const _Tp& __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return std::__detail::__cxx_atomic_fetch_max(&this->__a_, __op, __m); - } - _CCCL_HOST_DEVICE _Tp fetch_min(const _Tp& __op, memory_order 
__m = memory_order_seq_cst) volatile noexcept - { - return std::__detail::__cxx_atomic_fetch_min(&this->__a_, __op, __m); - } -}; - -// atomic - -template -struct atomic<_Tp*, _Sco> : public std::__atomic_base<_Tp*, _Sco> -{ - typedef std::__atomic_base<_Tp*, _Sco> __base; - - constexpr atomic() noexcept = default; - _CCCL_HOST_DEVICE constexpr atomic(_Tp* __d) noexcept - : __base(__d) + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp __d) noexcept + : _CUDA_VSTD::__atomic_impl<_Tp, _Sco>(__d) {} - _CCCL_HOST_DEVICE _Tp* operator=(_Tp* __d) volatile noexcept + atomic(const atomic&) = delete; + atomic& operator=(const atomic&) = delete; + atomic& operator=(const atomic&) volatile = delete; + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) volatile noexcept { - __base::store(__d); + this->store(__d); return __d; } - _CCCL_HOST_DEVICE _Tp* operator=(_Tp* __d) noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) noexcept { - __base::store(__d); + this->store(__d); return __d; } - _CCCL_HOST_DEVICE _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _CCCL_HOST_DEVICE _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _CCCL_HOST_DEVICE _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_max(const _Tp& __op, memory_order __m = memory_order_seq_cst) noexcept { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + return _CUDA_VSTD::__atomic_fetch_max_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } - _CCCL_HOST_DEVICE _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_max(const _Tp& __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + return _CUDA_VSTD::__atomic_fetch_max_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } - _CCCL_HOST_DEVICE _Tp* operator++(int) volatile noexcept - { - return fetch_add(1); - } - _CCCL_HOST_DEVICE _Tp* operator++(int) noexcept - { - return fetch_add(1); - } - _CCCL_HOST_DEVICE _Tp* operator--(int) volatile noexcept - { - return fetch_sub(1); - } - _CCCL_HOST_DEVICE _Tp* operator--(int) noexcept - { - return fetch_sub(1); - } - _CCCL_HOST_DEVICE _Tp* operator++() volatile noexcept - { - return fetch_add(1) + 1; - } - _CCCL_HOST_DEVICE _Tp* operator++() noexcept - { - return fetch_add(1) + 1; - } - _CCCL_HOST_DEVICE _Tp* operator--() volatile noexcept - { - return fetch_sub(1) - 1; - } - _CCCL_HOST_DEVICE _Tp* operator--() noexcept - { - return fetch_sub(1) - 1; - } - _CCCL_HOST_DEVICE _Tp* operator+=(ptrdiff_t __op) volatile noexcept - { - return fetch_add(__op) + __op; - } - _CCCL_HOST_DEVICE _Tp* operator+=(ptrdiff_t __op) noexcept - { - return fetch_add(__op) + __op; - } - _CCCL_HOST_DEVICE _Tp* operator-=(ptrdiff_t __op) volatile noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_min(const _Tp& __op, memory_order __m = memory_order_seq_cst) noexcept { - return fetch_sub(__op) - __op; + return _CUDA_VSTD::__atomic_fetch_min_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } - _CCCL_HOST_DEVICE _Tp* operator-=(ptrdiff_t __op) noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_min(const _Tp& __op, memory_order __m = memory_order_seq_cst) volatile noexcept { - 
return fetch_sub(__op) - __op; + return _CUDA_VSTD::__atomic_fetch_min_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } }; // atomic_ref template -struct atomic_ref : public std::__atomic_base_ref<_Tp, _Sco> +struct atomic_ref : public _CUDA_VSTD::__atomic_ref_impl<_Tp, _Sco> { - typedef std::__atomic_base_ref<_Tp, _Sco> __base; - - _CCCL_HOST_DEVICE constexpr atomic_ref(_Tp& __d) noexcept - : __base(__d) - {} + using value_type = _Tp; - _CCCL_HOST_DEVICE _Tp operator=(_Tp __d) const noexcept - { - __base::store(__d); - return __d; - } + static constexpr size_t required_alignment = sizeof(_Tp); - _CCCL_HOST_DEVICE _Tp fetch_max(const _Tp& __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return std::__detail::__cxx_atomic_fetch_max(&this->__a_, __op, __m); - } + static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; - _CCCL_HOST_DEVICE _Tp fetch_min(const _Tp& __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return std::__detail::__cxx_atomic_fetch_min(&this->__a_, __op, __m); - } -}; - -// atomic_ref - -template -struct atomic_ref<_Tp*, _Sco> : public std::__atomic_base_ref<_Tp*, _Sco> -{ - typedef std::__atomic_base_ref<_Tp*, _Sco> __base; - - _CCCL_HOST_DEVICE constexpr atomic_ref(_Tp*& __d) noexcept - : __base(__d) + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp& __ref) + : _CUDA_VSTD::__atomic_ref_impl<_Tp, _Sco>(__ref) {} - _CCCL_HOST_DEVICE _Tp* operator=(_Tp* __d) const noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __v) const noexcept { - __base::store(__d); - return __d; + this->store(__v); + return __v; } - _CCCL_HOST_DEVICE _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _CCCL_HOST_DEVICE _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } + atomic_ref(const atomic_ref&) noexcept = default; + atomic_ref& operator=(const atomic_ref&) = delete; + atomic_ref& operator=(const atomic_ref&) const = delete; - _CCCL_HOST_DEVICE _Tp* operator++(int) const noexcept + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_max(const _Tp& __op, memory_order __m = memory_order_seq_cst) const noexcept { - return fetch_add(1); + return _CUDA_VSTD::__atomic_fetch_max_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } - _CCCL_HOST_DEVICE _Tp* operator--(int) const noexcept - { - return fetch_sub(1); - } - _CCCL_HOST_DEVICE _Tp* operator++() const noexcept - { - return fetch_add(1) + 1; - } - _CCCL_HOST_DEVICE _Tp* operator--() const noexcept - { - return fetch_sub(1) - 1; - } - _CCCL_HOST_DEVICE _Tp* operator+=(ptrdiff_t __op) const noexcept - { - return fetch_add(__op) + __op; - } - _CCCL_HOST_DEVICE _Tp* operator-=(ptrdiff_t __op) const noexcept + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_min(const _Tp& __op, memory_order __m = memory_order_seq_cst) const noexcept { - return fetch_sub(__op) - __op; + return _CUDA_VSTD::__atomic_fetch_min_dispatch(&this->__a, __op, __m, _CUDA_VSTD::__scope_to_tag<_Sco>{}); } }; @@ -256,25 +115,25 @@ atomic_thread_fence(memory_order __m, thread_scope _Scope = thread_scope::thread NV_IS_DEVICE, (switch (_Scope) { case thread_scope::thread_scope_system: - std::__detail::__atomic_thread_fence_cuda((int) __m, __detail::__thread_scope_system_tag()); + _CUDA_VSTD::__atomic_thread_fence_cuda((int) __m, __thread_scope_system_tag{}); break; case thread_scope::thread_scope_device: 
- std::__detail::__atomic_thread_fence_cuda((int) __m, __detail::__thread_scope_device_tag()); + _CUDA_VSTD::__atomic_thread_fence_cuda((int) __m, __thread_scope_device_tag{}); break; case thread_scope::thread_scope_block: - std::__detail::__atomic_thread_fence_cuda((int) __m, __detail::__thread_scope_block_tag()); + _CUDA_VSTD::__atomic_thread_fence_cuda((int) __m, __thread_scope_block_tag{}); break; // Atomics scoped to themselves do not require fencing case thread_scope::thread_scope_thread: break; }), NV_IS_HOST, - ((void) _Scope; std::atomic_thread_fence(__m);)) + ((void) _Scope; _CUDA_VSTD::atomic_thread_fence(__m);)) } inline _CCCL_HOST_DEVICE void atomic_signal_fence(memory_order __m) { - std::atomic_signal_fence(__m); + _CUDA_VSTD::atomic_signal_fence(__m); } _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/__cuda/atomic_prelude.h b/libcudacxx/include/cuda/std/__cuda/atomic_prelude.h deleted file mode 100644 index 4e43fb4481..0000000000 --- a/libcudacxx/include/cuda/std/__cuda/atomic_prelude.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX___CUDA_ATOMIC_PRELUDE_H -#define _LIBCUDACXX___CUDA_ATOMIC_PRELUDE_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#ifndef _CCCL_COMPILER_NVRTC -# include // TRANSITION: Fix transitive includes - -# include -static_assert(ATOMIC_BOOL_LOCK_FREE == 2, ""); -static_assert(ATOMIC_CHAR_LOCK_FREE == 2, ""); -static_assert(ATOMIC_CHAR16_T_LOCK_FREE == 2, ""); -static_assert(ATOMIC_CHAR32_T_LOCK_FREE == 2, ""); -static_assert(ATOMIC_WCHAR_T_LOCK_FREE == 2, ""); -static_assert(ATOMIC_SHORT_LOCK_FREE == 2, ""); -static_assert(ATOMIC_INT_LOCK_FREE == 2, ""); -static_assert(ATOMIC_LONG_LOCK_FREE == 2, ""); -static_assert(ATOMIC_LLONG_LOCK_FREE == 2, ""); -static_assert(ATOMIC_POINTER_LOCK_FREE == 2, ""); -# undef ATOMIC_BOOL_LOCK_FREE -# undef ATOMIC_BOOL_LOCK_FREE -# undef ATOMIC_CHAR_LOCK_FREE -# undef ATOMIC_CHAR16_T_LOCK_FREE -# undef ATOMIC_CHAR32_T_LOCK_FREE -# undef ATOMIC_WCHAR_T_LOCK_FREE -# undef ATOMIC_SHORT_LOCK_FREE -# undef ATOMIC_INT_LOCK_FREE -# undef ATOMIC_LONG_LOCK_FREE -# undef ATOMIC_LLONG_LOCK_FREE -# undef ATOMIC_POINTER_LOCK_FREE -# undef ATOMIC_FLAG_INIT -# undef ATOMIC_VAR_INIT -#endif // _CCCL_COMPILER_NVRTC - -// pre-define lock free query for heterogeneous compatibility -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) (__x <= 8) -#endif - -#ifndef _CCCL_COMPILER_NVRTC -# include - -# include -#endif // _CCCL_COMPILER_NVRTC - -#endif // _LIBCUDACXX___CUDA_ATOMIC_PRELUDE_H diff --git a/libcudacxx/include/cuda/std/__cuda/barrier.h b/libcudacxx/include/cuda/std/__cuda/barrier.h index b116540607..8533501ae1 100644 --- a/libcudacxx/include/cuda/std/__cuda/barrier.h +++ b/libcudacxx/include/cuda/std/__cuda/barrier.h @@ -25,6 +25,7 @@ # pragma system_header #endif // no system 
header +#include #include // _CUDA_VSTD::__void_t #include // _LIBCUDACXX_UNREACHABLE @@ -124,7 +125,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template <> class barrier : public __block_scope_barrier_base { - using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, (int) thread_scope_block>; + using __barrier_base = _CUDA_VSTD::__barrier_base<_CUDA_VSTD::__empty_completion, thread_scope_block>; __barrier_base __barrier; _CCCL_DEVICE friend inline _CUDA_VSTD::uint64_t* diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index 8b7e696e93..8ae9efb420 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -1,18 +1,32 @@ -//===----------------------------------------------------------------------===// +// -*- C++ -*- +//===--------------------------- atomic -----------------------------------===// // -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #ifndef _CUDA_STD_ATOMIC #define _CUDA_STD_ATOMIC +// clang-format off + #include +#ifdef _LIBCUDACXX_HAS_NO_THREADS +# error is not supported on this single threaded system +#endif +#ifdef _LIBCUDACXX_HAS_NO_ATOMIC_HEADER +# error is not implemented +#endif +#ifdef _LIBCUDACXX_UNSUPPORTED_THREAD_API +# error " is not supported on this system" +#endif +#ifdef kill_dependency +# error C++ standard library is incompatible with +#endif + #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) @@ -21,9 +35,803 @@ # pragma system_header #endif // no system header +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// clang-format on + _CCCL_PUSH_MACROS -#include +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp kill_dependency(_Tp __y) noexcept +{ + return __y; +} + +// atomic +template +struct atomic : public __atomic_impl<_Tp> +{ + using value_type = _Tp; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic() noexcept + : __atomic_impl<_Tp>() + {} + + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp __d) noexcept + : __atomic_impl<_Tp>(__d) + {} + + atomic(const atomic&) = delete; + atomic& operator=(const atomic&) = delete; + atomic& operator=(const atomic&) volatile = delete; + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) volatile noexcept + { + this->store(__d); + return __d; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) noexcept + { + this->store(__d); + return __d; + } +}; + +// atomic_ref +template +struct atomic_ref : public __atomic_ref_impl<_Tp> +{ + using value_type = _Tp; + + static constexpr size_t required_alignment = sizeof(_Tp); + + static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; + + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp& __ref) + : __atomic_ref_impl<_Tp>(__ref) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __v) const noexcept + { + this->store(__v); + return __v; + } + + atomic_ref(const atomic_ref&) noexcept = default; + atomic_ref& operator=(const atomic_ref&) = delete; + atomic_ref& 
operator=(const atomic_ref&) const = delete; +}; + +// atomic_is_lock_free + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept +{ + return __o->is_lock_free(); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const atomic<_Tp>* __o) noexcept +{ + return __o->is_lock_free(); +} + +// atomic_init + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept +{ + __atomic_init_dispatch(&__o->__a, __d); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept +{ + __atomic_init_dispatch(&__o->__a, __d); +} + +// atomic_store + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept +{ + __o->store(__d); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept +{ + __o->store(__d); +} + +// atomic_store_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) +{ + __o->store(__d, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) +{ + __o->store(__d, __m); +} + +// atomic_load + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const volatile atomic<_Tp>* __o) noexcept +{ + return __o->load(); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const atomic<_Tp>* __o) noexcept +{ + return __o->load(); +} + +// atomic_load_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) +{ + return __o->load(__m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) +{ + return __o->load(__m); +} + +// atomic_exchange + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept +{ + return __o->exchange(__d); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept +{ + return __o->exchange(__d); +} + +// atomic_exchange_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +{ + return __o->exchange(__d, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +{ + return __o->exchange(__d, __m); +} + +// atomic_compare_exchange_weak + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +{ + return __o->compare_exchange_weak(*__e, __d); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +{ + return __o->compare_exchange_weak(*__e, __d); +} + +// atomic_compare_exchange_strong + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +{ + return __o->compare_exchange_strong(*__e, __d); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +{ + return __o->compare_exchange_strong(*__e, __d); +} + +// 
atomic_compare_exchange_weak_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) +{ + return __o->compare_exchange_weak(*__e, __d, __s, __f); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool +atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) +{ + return __o->compare_exchange_weak(*__e, __d, __s, __f); +} + +// atomic_compare_exchange_strong_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) +{ + return __o->compare_exchange_strong(*__e, __d, __s, __f); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) +{ + return __o->compare_exchange_strong(*__e, __d, __s, __f); +} + +// atomic_wait + +template +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept +{ + return __o->wait(__v); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_wait(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept +{ + return __o->wait(__v); +} + +// atomic_wait_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) +{ + return __o->wait(__v, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) +{ + return __o->wait(__v, __m); +} + +// atomic_notify_one + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept +{ + __o->notify_one(); +} +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(atomic<_Tp>* __o) noexcept +{ + __o->notify_one(); +} + +// atomic_notify_one + +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept +{ + __o->notify_all(); +} +template +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(atomic<_Tp>* __o) noexcept +{ + __o->notify_all(); +} + +// atomic_fetch_add + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_add(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_add(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +{ + return __o->fetch_add(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +{ + return __o->fetch_add(__op); +} + +// atomic_fetch_add_explicit + +template 
+_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_add(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_add(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept +{ + return __o->fetch_add(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_add_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept +{ + return __o->fetch_add(__op, __m); +} + +// atomic_fetch_sub + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_sub(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_sub(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +{ + return __o->fetch_sub(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +{ + return __o->fetch_sub(__op); +} + +// atomic_fetch_sub_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_sub(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_sub(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept +{ + return __o->fetch_sub(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_sub_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept +{ + return __o->fetch_sub(__op, __m); +} + +// atomic_fetch_and + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_and(volatile atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_and(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_and(atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_and(__op); +} + +// atomic_fetch_and_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_and(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_and_explicit(atomic<_Tp>* __o, _Tp __op, 
memory_order __m) noexcept +{ + return __o->fetch_and(__op, __m); +} + +// atomic_fetch_or + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_or(volatile atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_or(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_or(atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_or(__op); +} + +// atomic_fetch_or_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_or(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_or_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_or(__op, __m); +} + +// atomic_fetch_xor + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_xor(volatile atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_xor(__op); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_xor(atomic<_Tp>* __o, _Tp __op) noexcept +{ + return __o->fetch_xor(__op); +} + +// atomic_fetch_xor_explicit + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_xor(__op, __m); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> +atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept +{ + return __o->fetch_xor(__op, __m); +} + +// flag type and operations + +struct atomic_flag +{ + __atomic_storage_t<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a; + + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __atomic_load_dispatch(&__a, __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __atomic_load_dispatch(&__a, __m, __thread_scope_system_tag{}); + } + + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __atomic_exchange_dispatch(&__a, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept + { + return __atomic_exchange_dispatch(&__a, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept + { + __atomic_store_dispatch(&__a, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept + { + __atomic_store_dispatch(&__a, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}); + } + + _LIBCUDACXX_INLINE_VISIBILITY void + wait(_LIBCUDACXX_ATOMIC_FLAG_TYPE __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __atomic_wait(&__a, __v, __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void + 
wait(_LIBCUDACXX_ATOMIC_FLAG_TYPE __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __atomic_wait(&__a, __v, __m, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept + { + __atomic_notify_one(&__a, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept + { + __atomic_notify_one(&__a, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept + { + __atomic_notify_all(&__a, __thread_scope_system_tag{}); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept + { + __atomic_notify_all(&__a, __thread_scope_system_tag{}); + } + + atomic_flag() noexcept = default; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic_flag(bool __b) noexcept + : __a(__b) + {} // EXTENSION + + atomic_flag(const atomic_flag&) = delete; + atomic_flag& operator=(const atomic_flag&) = delete; + atomic_flag& operator=(const atomic_flag&) volatile = delete; +}; + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const volatile atomic_flag* __o) noexcept +{ + return __o->test(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const atomic_flag* __o) noexcept +{ + return __o->test(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool +atomic_flag_test_explicit(const volatile atomic_flag* __o, memory_order __m) noexcept +{ + return __o->test(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept +{ + return __o->test(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept +{ + return __o->test_and_set(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(atomic_flag* __o) noexcept +{ + return __o->test_and_set(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool +atomic_flag_test_and_set_explicit(volatile atomic_flag* __o, memory_order __m) noexcept +{ + return __o->test_and_set(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept +{ + return __o->test_and_set(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(volatile atomic_flag* __o) noexcept +{ + __o->clear(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(atomic_flag* __o) noexcept +{ + __o->clear(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_clear_explicit(volatile atomic_flag* __o, memory_order __m) noexcept +{ + __o->clear(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept +{ + __o->clear(__m); +} + +#if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept +{ + __o->wait(__v); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept +{ + __o->wait(__v); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) noexcept +{ + __o->wait(__v, __m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) noexcept +{ + __o->wait(__v, __m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(volatile atomic_flag* __o) noexcept +{ + __o->notify_one(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void 
atomic_flag_notify_one(atomic_flag* __o) noexcept +{ + __o->notify_one(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(volatile atomic_flag* __o) noexcept +{ + __o->notify_all(); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(atomic_flag* __o) noexcept +{ + __o->notify_all(); +} + +#endif + +// fences + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept +{ + __atomic_thread_fence_dispatch(__m); +} + +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept +{ + __atomic_signal_fence_dispatch(__m); +} + +// Atomics for standard typedef types + +typedef atomic atomic_bool; +typedef atomic atomic_char; +typedef atomic atomic_schar; +typedef atomic atomic_uchar; +typedef atomic atomic_short; +typedef atomic atomic_ushort; +typedef atomic atomic_int; +typedef atomic atomic_uint; +typedef atomic atomic_long; +typedef atomic atomic_ulong; +typedef atomic atomic_llong; +typedef atomic atomic_ullong; +typedef atomic atomic_char16_t; +typedef atomic atomic_char32_t; +typedef atomic atomic_wchar_t; + +typedef atomic atomic_int_least8_t; +typedef atomic atomic_uint_least8_t; +typedef atomic atomic_int_least16_t; +typedef atomic atomic_uint_least16_t; +typedef atomic atomic_int_least32_t; +typedef atomic atomic_uint_least32_t; +typedef atomic atomic_int_least64_t; +typedef atomic atomic_uint_least64_t; + +typedef atomic atomic_int_fast8_t; +typedef atomic atomic_uint_fast8_t; +typedef atomic atomic_int_fast16_t; +typedef atomic atomic_uint_fast16_t; +typedef atomic atomic_int_fast32_t; +typedef atomic atomic_uint_fast32_t; +typedef atomic atomic_int_fast64_t; +typedef atomic atomic_uint_fast64_t; + +typedef atomic atomic_int8_t; +typedef atomic atomic_uint8_t; +typedef atomic atomic_int16_t; +typedef atomic atomic_uint16_t; +typedef atomic atomic_int32_t; +typedef atomic atomic_uint32_t; +typedef atomic atomic_int64_t; +typedef atomic atomic_uint64_t; + +typedef atomic atomic_intptr_t; +typedef atomic atomic_uintptr_t; +typedef atomic atomic_size_t; +typedef atomic atomic_ptrdiff_t; +typedef atomic atomic_intmax_t; +typedef atomic atomic_uintmax_t; + +static_assert(LIBCUDACXX_ATOMIC_INT_LOCK_FREE, "This library assumes atomic is lock-free."); + +typedef atomic atomic_signed_lock_free; +typedef atomic atomic_unsigned_lock_free; + +#define LIBCUDACXX_ATOMIC_FLAG_INIT \ + { \ + false \ + } +#define LIBCUDACXX_ATOMIC_VAR_INIT(__v) \ + { \ + __v \ + } + +_LIBCUDACXX_END_NAMESPACE_STD _CCCL_POP_MACROS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support index d6889e1822..9f5fbe9255 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__threading_support @@ -20,7 +20,6 @@ # pragma system_header #endif // no system header -#include #include #include #include diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic deleted file mode 100644 index 5656afa683..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic +++ /dev/null @@ -1,2999 +0,0 @@ -// -*- C++ -*- -//===--------------------------- atomic -----------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
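A minimal usage sketch of the surface the new <cuda/std/atomic> header declares above: the atomic/atomic_ref class templates, the atomic_* typedefs, atomic_flag, and the renamed LIBCUDACXX_ATOMIC_FLAG_INIT / LIBCUDACXX_ATOMIC_VAR_INIT macros. Only the library names come from the header; the struct and function names below are invented for the example, and the __host__ __device__ annotation assumes a CUDA compiler.

#include <cuda/std/atomic>

struct counters
{
  // brace-initialization through the renamed macros instead of ATOMIC_VAR_INIT / ATOMIC_FLAG_INIT
  cuda::std::atomic_int  hits = LIBCUDACXX_ATOMIC_VAR_INIT(0);
  cuda::std::atomic_flag once = LIBCUDACXX_ATOMIC_FLAG_INIT;
};

__host__ __device__ void record_hit(counters& c)
{
  c.hits.fetch_add(1, cuda::std::memory_order_relaxed); // or the free function: cuda::std::atomic_fetch_add(&c.hits, 1)
  if (!c.once.test_and_set(cuda::std::memory_order_acq_rel))
  {
    // first caller through performs any one-time work here
  }
}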
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC -#define _LIBCUDACXX_ATOMIC - -/* - atomic synopsis - -namespace std -{ - -// feature test macro - -#define __cpp_lib_atomic_is_always_lock_free // as specified by SG10 - - // order and consistency - - enum memory_order: unspecified // enum class in C++20 - { - relaxed, - consume, // load-consume - acquire, // load-acquire - release, // store-release - acq_rel, // store-release load-acquire - seq_cst // store-release load-acquire - }; - - inline constexpr auto memory_order_relaxed = memory_order::relaxed; - inline constexpr auto memory_order_consume = memory_order::consume; - inline constexpr auto memory_order_acquire = memory_order::acquire; - inline constexpr auto memory_order_release = memory_order::release; - inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; - inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; - -template T kill_dependency(T y) noexcept; - -// lock-free property - -#define ATOMIC_BOOL_LOCK_FREE unspecified -#define ATOMIC_CHAR_LOCK_FREE unspecified -#define ATOMIC_CHAR16_T_LOCK_FREE unspecified -#define ATOMIC_CHAR32_T_LOCK_FREE unspecified -#define ATOMIC_WCHAR_T_LOCK_FREE unspecified -#define ATOMIC_SHORT_LOCK_FREE unspecified -#define ATOMIC_INT_LOCK_FREE unspecified -#define ATOMIC_LONG_LOCK_FREE unspecified -#define ATOMIC_LLONG_LOCK_FREE unspecified -#define ATOMIC_POINTER_LOCK_FREE unspecified - -// flag type and operations - -typedef struct atomic_flag -{ - bool test_and_set(memory_order m = memory_order_seq_cst) volatile noexcept; - bool test_and_set(memory_order m = memory_order_seq_cst) noexcept; - void clear(memory_order m = memory_order_seq_cst) volatile noexcept; - void clear(memory_order m = memory_order_seq_cst) noexcept; - atomic_flag() noexcept = default; - atomic_flag(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) volatile = delete; -} atomic_flag; - -bool - atomic_flag_test_and_set(volatile atomic_flag* obj) noexcept; - -bool - atomic_flag_test_and_set(atomic_flag* obj) noexcept; - -bool - atomic_flag_test_and_set_explicit(volatile atomic_flag* obj, - memory_order m) noexcept; - -bool - atomic_flag_test_and_set_explicit(atomic_flag* obj, memory_order m) noexcept; - -void - atomic_flag_clear(volatile atomic_flag* obj) noexcept; - -void - atomic_flag_clear(atomic_flag* obj) noexcept; - -void - atomic_flag_clear_explicit(volatile atomic_flag* obj, memory_order m) noexcept; - -void - atomic_flag_clear_explicit(atomic_flag* obj, memory_order m) noexcept; - -#define ATOMIC_FLAG_INIT see below -#define ATOMIC_VAR_INIT(value) see below - -template -struct atomic -{ - static constexpr bool is_always_lock_free; - bool is_lock_free() const volatile noexcept; - bool is_lock_free() const noexcept; - void store(T desr, memory_order m = memory_order_seq_cst) volatile noexcept; - void store(T desr, memory_order m = memory_order_seq_cst) noexcept; - T load(memory_order m = memory_order_seq_cst) const volatile noexcept; - T load(memory_order m = memory_order_seq_cst) const noexcept; - operator T() const volatile noexcept; - operator T() const noexcept; - T exchange(T desr, memory_order m = memory_order_seq_cst) volatile noexcept; - T exchange(T desr, memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_weak(T& expc, T desr, - memory_order s, memory_order f) 
volatile noexcept; - bool compare_exchange_weak(T& expc, T desr, memory_order s, memory_order f) noexcept; - bool compare_exchange_strong(T& expc, T desr, - memory_order s, memory_order f) volatile noexcept; - bool compare_exchange_strong(T& expc, T desr, - memory_order s, memory_order f) noexcept; - bool compare_exchange_weak(T& expc, T desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_weak(T& expc, T desr, - memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_strong(T& expc, T desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_strong(T& expc, T desr, - memory_order m = memory_order_seq_cst) noexcept; - - atomic() noexcept = default; - constexpr atomic(T desr) noexcept; - atomic(const atomic&) = delete; - atomic& operator=(const atomic&) = delete; - atomic& operator=(const atomic&) volatile = delete; - T operator=(T) volatile noexcept; - T operator=(T) noexcept; -}; - -template <> -struct atomic -{ - static constexpr bool is_always_lock_free; - bool is_lock_free() const volatile noexcept; - bool is_lock_free() const noexcept; - void store(integral desr, memory_order m = memory_order_seq_cst) volatile noexcept; - void store(integral desr, memory_order m = memory_order_seq_cst) noexcept; - integral load(memory_order m = memory_order_seq_cst) const volatile noexcept; - integral load(memory_order m = memory_order_seq_cst) const noexcept; - operator integral() const volatile noexcept; - operator integral() const noexcept; - integral exchange(integral desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - integral exchange(integral desr, memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_weak(integral& expc, integral desr, - memory_order s, memory_order f) volatile noexcept; - bool compare_exchange_weak(integral& expc, integral desr, - memory_order s, memory_order f) noexcept; - bool compare_exchange_strong(integral& expc, integral desr, - memory_order s, memory_order f) volatile noexcept; - bool compare_exchange_strong(integral& expc, integral desr, - memory_order s, memory_order f) noexcept; - bool compare_exchange_weak(integral& expc, integral desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_weak(integral& expc, integral desr, - memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_strong(integral& expc, integral desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_strong(integral& expc, integral desr, - memory_order m = memory_order_seq_cst) noexcept; - - integral - fetch_add(integral op, memory_order m = memory_order_seq_cst) volatile noexcept; - integral fetch_add(integral op, memory_order m = memory_order_seq_cst) noexcept; - integral - fetch_sub(integral op, memory_order m = memory_order_seq_cst) volatile noexcept; - integral fetch_sub(integral op, memory_order m = memory_order_seq_cst) noexcept; - integral - fetch_and(integral op, memory_order m = memory_order_seq_cst) volatile noexcept; - integral fetch_and(integral op, memory_order m = memory_order_seq_cst) noexcept; - integral - fetch_or(integral op, memory_order m = memory_order_seq_cst) volatile noexcept; - integral fetch_or(integral op, memory_order m = memory_order_seq_cst) noexcept; - integral - fetch_xor(integral op, memory_order m = memory_order_seq_cst) volatile noexcept; - integral fetch_xor(integral op, memory_order m = memory_order_seq_cst) noexcept; - - atomic() noexcept = default; - constexpr 
atomic(integral desr) noexcept; - atomic(const atomic&) = delete; - atomic& operator=(const atomic&) = delete; - atomic& operator=(const atomic&) volatile = delete; - integral operator=(integral desr) volatile noexcept; - integral operator=(integral desr) noexcept; - - integral operator++(int) volatile noexcept; - integral operator++(int) noexcept; - integral operator--(int) volatile noexcept; - integral operator--(int) noexcept; - integral operator++() volatile noexcept; - integral operator++() noexcept; - integral operator--() volatile noexcept; - integral operator--() noexcept; - integral operator+=(integral op) volatile noexcept; - integral operator+=(integral op) noexcept; - integral operator-=(integral op) volatile noexcept; - integral operator-=(integral op) noexcept; - integral operator&=(integral op) volatile noexcept; - integral operator&=(integral op) noexcept; - integral operator|=(integral op) volatile noexcept; - integral operator|=(integral op) noexcept; - integral operator^=(integral op) volatile noexcept; - integral operator^=(integral op) noexcept; -}; - -template -struct atomic -{ - static constexpr bool is_always_lock_free; - bool is_lock_free() const volatile noexcept; - bool is_lock_free() const noexcept; - void store(T* desr, memory_order m = memory_order_seq_cst) volatile noexcept; - void store(T* desr, memory_order m = memory_order_seq_cst) noexcept; - T* load(memory_order m = memory_order_seq_cst) const volatile noexcept; - T* load(memory_order m = memory_order_seq_cst) const noexcept; - operator T*() const volatile noexcept; - operator T*() const noexcept; - T* exchange(T* desr, memory_order m = memory_order_seq_cst) volatile noexcept; - T* exchange(T* desr, memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_weak(T*& expc, T* desr, - memory_order s, memory_order f) volatile noexcept; - bool compare_exchange_weak(T*& expc, T* desr, - memory_order s, memory_order f) noexcept; - bool compare_exchange_strong(T*& expc, T* desr, - memory_order s, memory_order f) volatile noexcept; - bool compare_exchange_strong(T*& expc, T* desr, - memory_order s, memory_order f) noexcept; - bool compare_exchange_weak(T*& expc, T* desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_weak(T*& expc, T* desr, - memory_order m = memory_order_seq_cst) noexcept; - bool compare_exchange_strong(T*& expc, T* desr, - memory_order m = memory_order_seq_cst) volatile noexcept; - bool compare_exchange_strong(T*& expc, T* desr, - memory_order m = memory_order_seq_cst) noexcept; - T* fetch_add(ptrdiff_t op, memory_order m = memory_order_seq_cst) volatile noexcept; - T* fetch_add(ptrdiff_t op, memory_order m = memory_order_seq_cst) noexcept; - T* fetch_sub(ptrdiff_t op, memory_order m = memory_order_seq_cst) volatile noexcept; - T* fetch_sub(ptrdiff_t op, memory_order m = memory_order_seq_cst) noexcept; - - atomic() noexcept = default; - constexpr atomic(T* desr) noexcept; - atomic(const atomic&) = delete; - atomic& operator=(const atomic&) = delete; - atomic& operator=(const atomic&) volatile = delete; - - T* operator=(T*) volatile noexcept; - T* operator=(T*) noexcept; - T* operator++(int) volatile noexcept; - T* operator++(int) noexcept; - T* operator--(int) volatile noexcept; - T* operator--(int) noexcept; - T* operator++() volatile noexcept; - T* operator++() noexcept; - T* operator--() volatile noexcept; - T* operator--() noexcept; - T* operator+=(ptrdiff_t op) volatile noexcept; - T* operator+=(ptrdiff_t op) noexcept; - T* 
operator-=(ptrdiff_t op) volatile noexcept; - T* operator-=(ptrdiff_t op) noexcept; -}; - - -template - bool - atomic_is_lock_free(const volatile atomic* obj) noexcept; - -template - bool - atomic_is_lock_free(const atomic* obj) noexcept; - -template - void - atomic_init(volatile atomic* obj, T desr) noexcept; - -template - void - atomic_init(atomic* obj, T desr) noexcept; - -template - void - atomic_store(volatile atomic* obj, T desr) noexcept; - -template - void - atomic_store(atomic* obj, T desr) noexcept; - -template - void - atomic_store_explicit(volatile atomic* obj, T desr, memory_order m) noexcept; - -template - void - atomic_store_explicit(atomic* obj, T desr, memory_order m) noexcept; - -template - T - atomic_load(const volatile atomic* obj) noexcept; - -template - T - atomic_load(const atomic* obj) noexcept; - -template - T - atomic_load_explicit(const volatile atomic* obj, memory_order m) noexcept; - -template - T - atomic_load_explicit(const atomic* obj, memory_order m) noexcept; - -template - T - atomic_exchange(volatile atomic* obj, T desr) noexcept; - -template - T - atomic_exchange(atomic* obj, T desr) noexcept; - -template - T - atomic_exchange_explicit(volatile atomic* obj, T desr, memory_order m) noexcept; - -template - T - atomic_exchange_explicit(atomic* obj, T desr, memory_order m) noexcept; - -template - bool - atomic_compare_exchange_weak(volatile atomic* obj, T* expc, T desr) noexcept; - -template - bool - atomic_compare_exchange_weak(atomic* obj, T* expc, T desr) noexcept; - -template - bool - atomic_compare_exchange_strong(volatile atomic* obj, T* expc, T desr) noexcept; - -template - bool - atomic_compare_exchange_strong(atomic* obj, T* expc, T desr) noexcept; - -template - bool - atomic_compare_exchange_weak_explicit(volatile atomic* obj, T* expc, - T desr, - memory_order s, memory_order f) noexcept; - -template - bool - atomic_compare_exchange_weak_explicit(atomic* obj, T* expc, T desr, - memory_order s, memory_order f) noexcept; - -template - bool - atomic_compare_exchange_strong_explicit(volatile atomic* obj, - T* expc, T desr, - memory_order s, memory_order f) noexcept; - -template - bool - atomic_compare_exchange_strong_explicit(atomic* obj, T* expc, - T desr, - memory_order s, memory_order f) noexcept; - -template - Integral - atomic_fetch_add(volatile atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_add(atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_add_explicit(volatile atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_add_explicit(atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_sub(volatile atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_sub(atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_sub_explicit(volatile atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_sub_explicit(atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_and(volatile atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_and(atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_and_explicit(volatile atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_and_explicit(atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_or(volatile atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_or(atomic* 
obj, Integral op) noexcept; - -template - Integral - atomic_fetch_or_explicit(volatile atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_or_explicit(atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_xor(volatile atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_xor(atomic* obj, Integral op) noexcept; - -template - Integral - atomic_fetch_xor_explicit(volatile atomic* obj, Integral op, - memory_order m) noexcept; -template - Integral - atomic_fetch_xor_explicit(atomic* obj, Integral op, - memory_order m) noexcept; - -template - T* - atomic_fetch_add(volatile atomic* obj, ptrdiff_t op) noexcept; - -template - T* - atomic_fetch_add(atomic* obj, ptrdiff_t op) noexcept; - -template - T* - atomic_fetch_add_explicit(volatile atomic* obj, ptrdiff_t op, - memory_order m) noexcept; -template - T* - atomic_fetch_add_explicit(atomic* obj, ptrdiff_t op, memory_order m) noexcept; - -template - T* - atomic_fetch_sub(volatile atomic* obj, ptrdiff_t op) noexcept; - -template - T* - atomic_fetch_sub(atomic* obj, ptrdiff_t op) noexcept; - -template - T* - atomic_fetch_sub_explicit(volatile atomic* obj, ptrdiff_t op, - memory_order m) noexcept; -template - T* - atomic_fetch_sub_explicit(atomic* obj, ptrdiff_t op, memory_order m) noexcept; - -// Atomics for standard typedef types - -typedef atomic atomic_bool; -typedef atomic atomic_char; -typedef atomic atomic_schar; -typedef atomic atomic_uchar; -typedef atomic atomic_short; -typedef atomic atomic_ushort; -typedef atomic atomic_int; -typedef atomic atomic_uint; -typedef atomic atomic_long; -typedef atomic atomic_ulong; -typedef atomic atomic_llong; -typedef atomic atomic_ullong; -typedef atomic atomic_char16_t; -typedef atomic atomic_char32_t; -typedef atomic atomic_wchar_t; - -typedef atomic atomic_int_least8_t; -typedef atomic atomic_uint_least8_t; -typedef atomic atomic_int_least16_t; -typedef atomic atomic_uint_least16_t; -typedef atomic atomic_int_least32_t; -typedef atomic atomic_uint_least32_t; -typedef atomic atomic_int_least64_t; -typedef atomic atomic_uint_least64_t; - -typedef atomic atomic_int_fast8_t; -typedef atomic atomic_uint_fast8_t; -typedef atomic atomic_int_fast16_t; -typedef atomic atomic_uint_fast16_t; -typedef atomic atomic_int_fast32_t; -typedef atomic atomic_uint_fast32_t; -typedef atomic atomic_int_fast64_t; -typedef atomic atomic_uint_fast64_t; - -typedef atomic atomic_int8_t; -typedef atomic atomic_uint8_t; -typedef atomic atomic_int16_t; -typedef atomic atomic_uint16_t; -typedef atomic atomic_int32_t; -typedef atomic atomic_uint32_t; -typedef atomic atomic_int64_t; -typedef atomic atomic_uint64_t; - -typedef atomic atomic_intptr_t; -typedef atomic atomic_uintptr_t; -typedef atomic atomic_size_t; -typedef atomic atomic_ptrdiff_t; -typedef atomic atomic_intmax_t; -typedef atomic atomic_uintmax_t; - -// fences - -void atomic_thread_fence(memory_order m) noexcept; -void atomic_signal_fence(memory_order m) noexcept; - -} // std - -*/ - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // all public C++ headers provide the assertion handler -#include -#include -#include -#include -#include - 
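The synopsis above documents the compare_exchange interface, including the separate success and failure memory orders. A hedged sketch of the canonical retry loop against that interface follows; atomic_fetch_max_sketch is an invented helper, not a library function.

#include <cuda/std/atomic>

__host__ __device__ inline int atomic_fetch_max_sketch(cuda::std::atomic<int>& a, int v)
{
  int expected = a.load(cuda::std::memory_order_relaxed);
  // Retry until v is installed or the stored value is already >= v;
  // compare_exchange_weak refreshes 'expected' with the current value on failure.
  while (expected < v
         && !a.compare_exchange_weak(expected, v,
                                     cuda::std::memory_order_relaxed,   // success order
                                     cuda::std::memory_order_relaxed))  // failure order
  {
  }
  return expected; // the previously stored value, or a value already >= v
}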
-_CCCL_PUSH_MACROS - -#ifdef _LIBCUDACXX_HAS_NO_THREADS -# error is not supported on this single threaded system -#endif -#ifdef _LIBCUDACXX_HAS_NO_ATOMIC_HEADER -# error is not implemented -#endif -#ifdef _LIBCUDACXX_UNSUPPORTED_THREAD_API -# error " is not supported on this system" -#endif -#ifdef kill_dependency -# error C++ standard library is incompatible with -#endif - -#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING( \ - __m == memory_order_consume || __m == memory_order_acquire || __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#endif - -#if !defined(_CCCL_COMPILER_NVRTC) -# include -#endif - -#if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE 2 -# define ATOMIC_CHAR_LOCK_FREE 2 -# define ATOMIC_CHAR16_T_LOCK_FREE 2 -# define ATOMIC_CHAR32_T_LOCK_FREE 2 -# define ATOMIC_WCHAR_T_LOCK_FREE 2 -# define ATOMIC_SHORT_LOCK_FREE 2 -# define ATOMIC_INT_LOCK_FREE 2 -# define ATOMIC_LONG_LOCK_FREE 2 -# define ATOMIC_LLONG_LOCK_FREE 2 -# define ATOMIC_POINTER_LOCK_FREE 2 -#endif //! defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) - -#ifndef __ATOMIC_RELAXED -# define __ATOMIC_RELAXED 0 -# define __ATOMIC_CONSUME 1 -# define __ATOMIC_ACQUIRE 2 -# define __ATOMIC_RELEASE 3 -# define __ATOMIC_ACQ_REL 4 -# define __ATOMIC_SEQ_CST 5 -#endif //__ATOMIC_RELAXED - -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -// Figure out what the underlying type for `memory_order` would be if it were -// declared as an unscoped enum (accounting for -fshort-enums). Use this result -// to pin the underlying type in C++20. 
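The comment above describes pinning memory_order's underlying type by first declaring an unscoped twin enum and reusing its underlying type for the scoped C++20 enum. The same trick in a generic, stand-alone form (all names invented for illustration) looks like this:

#include <type_traits>

// Unscoped enum: its underlying type is whatever the ABI (and -fshort-enums) chooses.
enum legacy_color { legacy_red, legacy_green, legacy_blue };

using color_underlying_t = std::underlying_type<legacy_color>::type;

// Scoped enum pinned to that same type, so switching to 'enum class' cannot change the ABI.
enum class color : color_underlying_t { red = legacy_red, green = legacy_green, blue = legacy_blue };

static_assert(std::is_same<std::underlying_type<color>::type, color_underlying_t>::value,
              "scoped enum keeps the legacy underlying type");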
-enum __legacy_memory_order -{ - __mo_relaxed, - __mo_consume, - __mo_acquire, - __mo_release, - __mo_acq_rel, - __mo_seq_cst -}; - -typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; - -#if _CCCL_STD_VER > 2017 - -enum class memory_order : __memory_order_underlying_t -{ - relaxed = __mo_relaxed, - consume = __mo_consume, - acquire = __mo_acquire, - release = __mo_release, - acq_rel = __mo_acq_rel, - seq_cst = __mo_seq_cst -}; - -inline constexpr auto memory_order_relaxed = memory_order::relaxed; -inline constexpr auto memory_order_consume = memory_order::consume; -inline constexpr auto memory_order_acquire = memory_order::acquire; -inline constexpr auto memory_order_release = memory_order::release; -inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; -inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; - -#else - -typedef enum memory_order -{ - memory_order_relaxed = __mo_relaxed, - memory_order_consume = __mo_consume, - memory_order_acquire = __mo_acquire, - memory_order_release = __mo_release, - memory_order_acq_rel = __mo_acq_rel, - memory_order_seq_cst = __mo_seq_cst, -} memory_order; - -#endif // _CCCL_STD_VER > 2017 - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) -{ -#if defined(_CCCL_CUDA_COMPILER) - return __lhs == __rhs; -#else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; -#endif -} - -static_assert((is_same::type, __memory_order_underlying_t>::value), - "unexpected underlying type for std::memory_order"); - -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) - -// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because -// the default operator= in an object is not volatile, a byte-by-byte copy -// is required. -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) -{ - __a_value = __val; -} -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) -{ - volatile char* __to = reinterpret_cast(&__a_value); - volatile char* __end = __to + sizeof(_Tp); - volatile const char* __from = reinterpret_cast(&__val); - while (__to != __end) - { - *__to++ = *__from++; - } -} - -#endif - -// Headers are wrapped like so: (cuda::std::|std::)detail -namespace __detail -{ -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) -# include -#endif - -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) -// TODO: Maybe support C11 atomics? 
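The "(cuda::std::|std::)__detail" wrapping noted above pairs a preprocessor-selected backend with using-declarations that re-export the chosen entry points (the using block appears just below). A generic sketch of that pattern, with every name invented and the builtin-based branch assuming a GCC/Clang-style __atomic_fetch_add builtin:

namespace mylib {
namespace detail {
#if defined(MYLIB_USE_BUILTIN_ATOMICS)
// backend A: compiler builtins
inline int fetch_add_backend(int* p, int v) { return __atomic_fetch_add(p, v, __ATOMIC_SEQ_CST); }
#else
// backend B: single-threaded fallback
inline int fetch_add_backend(int* p, int v) { int old = *p; *p += v; return old; }
#endif
} // namespace detail

using detail::fetch_add_backend; // callers spell it mylib::fetch_add_backend regardless of backend
} // namespace mylib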
-// #include -#endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP -} // namespace __detail - -using __detail::__cxx_atomic_base_impl; -using __detail::__cxx_atomic_compare_exchange_strong; -using __detail::__cxx_atomic_compare_exchange_weak; -using __detail::__cxx_atomic_exchange; -using __detail::__cxx_atomic_fetch_add; -using __detail::__cxx_atomic_fetch_and; -using __detail::__cxx_atomic_fetch_or; -using __detail::__cxx_atomic_fetch_sub; -using __detail::__cxx_atomic_fetch_xor; -using __detail::__cxx_atomic_load; -using __detail::__cxx_atomic_ref_base_impl; -using __detail::__cxx_atomic_signal_fence; -using __detail::__cxx_atomic_store; -using __detail::__cxx_atomic_thread_fence; - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp kill_dependency(_Tp __y) noexcept -{ - return __y; -} - -#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE -#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE -#endif - -#ifdef _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS - -template -struct __cxx_atomic_lock_impl -{ - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_lock_impl() noexcept - : __a_value() - , __a_lock(0) - {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_lock_impl(_Tp value) noexcept - : __a_value(value) - , __a_lock(0) - {} - - _Tp __a_value; - mutable __cxx_atomic_base_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, _Sco> __a_lock; - - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile - { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const - { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile - { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const - { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile - { - __lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a_value); - __unlock(); - return __old; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const - { - __lock(); - _Tp 
__old = __a_value; - __unlock(); - return __old; - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) -{ - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) -{ - __a->__a_value = __val; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) -{ - __a->__lock(); - __cxx_atomic_assign_volatile(__a->__a_value, __val); - __a->__unlock(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) -{ - __a->__lock(); - __a->__a_value = __val; - __a->__unlock(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) -{ - return __a->__read(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) -{ - return __a->__read(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __value); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value = __value; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) -{ - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if (__ret) - { - __cxx_atomic_assign_volatile(__a->__a_value, __value); - } - else - { - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - } - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) -{ - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if (__ret) - { - __a->__a_value = __value; - } - else - { - *__expected = __a->__a_value; - } - __a->__unlock(); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) -{ - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if (__ret) - { - __cxx_atomic_assign_volatile(__a->__a_value, __value); - } - else - { - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - } - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) -{ - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if (__ret) - { - __a->__a_value = __value; - } - else - { - *__expected = __a->__a_value; - } - __a->__unlock(); - return __ret; -} - -template 
-_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) -{ - __a->__lock(); - _Tp* __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) -{ - __a->__lock(); - _Tp* __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value -= __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value &= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value |= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) -{ - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value ^= 
__pattern; - __a->__unlock(); - return __old; -} - -# if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template -struct __cxx_is_always_lock_free -{ - enum - { - __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) - }; -}; - -# else - -template -struct __cxx_is_always_lock_free -{ - enum - { - __value = sizeof(_Tp) <= 8 - }; -}; - -# endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template -struct __cxx_atomic_impl_conditional -{ - using type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, - __cxx_atomic_base_impl<_Tp, _Sco>, - __cxx_atomic_lock_impl<_Tp, _Sco>>; -}; - -template ::type> -#else -template > -#endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -struct __cxx_atomic_impl : public _Base -{ - __cxx_atomic_impl() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_impl(_Tp value) noexcept - : _Base(value) - {} -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) -{ - static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp), ""); - static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp), ""); - return (__cxx_atomic_impl<_Tp, _Sco>*) __inst; -} - -template -using __cxx_atomic_ref_impl = __cxx_atomic_ref_base_impl<_Tp, _Sco>; - -#ifdef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - -template , int _Sco = _Ty::__sco> -struct __cxx_atomic_poll_tester -{ - _Ty const volatile* __a; - _Tp __val; - memory_order __order; - - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) - : __a(__a_) - , __val(__val_) - , __order(__order_) - {} - - _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const - { - return !(__cxx_atomic_load(__a, __order) == __val); - } -}; - -template , int _Sco = _Ty::__sco> -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) -{ - __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); -} - -#endif - -#ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t) 1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 0, memory_order_relaxed)) - { - __libcpp_platform_wake(&__c->__version, true); - } -# endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ - __cxx_atomic_notify_all(__a); -} -template , - int _Sco = _Ty::__sco, - __enable_if_t::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) -{ -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - { - return; - } - if 
(sizeof(__libcpp_platform_wait_t) < 8) - { - constexpr timespec __timeout = {2, 0}; // Hedge on rare 'int version' aliasing. - __libcpp_platform_wait(&__c->__version, __version, &__timeout); - } - else - { - __libcpp_platform_wait(&__c->__version, __version, nullptr); - } -# else - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -# endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -} - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) -{ -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); -# endif - __libcpp_platform_wait((_Tp*) __a, __val, nullptr); -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); -# endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -# endif - __libcpp_platform_wake((_Tp*) __a, true); -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ -# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -# endif - __libcpp_platform_wake((_Tp*) __a, false); -} - -#elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ - auto* const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) - { - return; - } - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 0, memory_order_relaxed)) - { - __libcpp_mutex_lock(&__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); - __libcpp_condvar_broadcast(&__c->__condvar); - } -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) -{ - __cxx_atomic_notify_all(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) -{ - auto* const __c = __libcpp_contention_state(__a); - __libcpp_mutex_lock(&__c->__mutex); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - { - __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); - } - __libcpp_mutex_unlock(&__c->__mutex); -} - -#else - -template -struct __atomic_wait_and_notify_supported -# if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 - : false_type -# else - : true_type -# 
endif -{}; - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) -{ - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) -{ - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, - "atomic notify-one operations are unsupported on Pascal"); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) -{ - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, - "atomic notify-all operations are unsupported on Pascal"); -} - -#endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) -{ - for (int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) - { - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - { - return; - } - if (__i < 12) - { - __libcpp_thread_yield_processor(); - } - else - { - __libcpp_thread_yield(); - } - } - while (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - { - __cxx_atomic_try_wait_slow(__a, __val, __order); - } -} - -template -struct __atomic_base_storage -{ - mutable _Storage __a_; - - __atomic_base_storage() = default; - __atomic_base_storage(const __atomic_base_storage&) = default; - __atomic_base_storage(__atomic_base_storage&&) = default; - - __atomic_base_storage& operator=(const __atomic_base_storage&) = default; - __atomic_base_storage& operator=(__atomic_base_storage&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_storage(_Storage&& __a) noexcept - : __a_(_CUDA_VSTD::forward<_Storage>(__a)) - {} -}; - -template -struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage> -{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = delete; - __atomic_base_core(__atomic_base_core&&) = delete; - - __atomic_base_core& operator=(const __atomic_base_core&) = delete; - __atomic_base_core& operator=(__atomic_base_core&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept - : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - - _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept - { - return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); - } - _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept - { - return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); - } - _LIBCUDACXX_INLINE_VISIBILITY - - void - store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - { - __cxx_atomic_store(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - { - __cxx_atomic_store(&this->__a_, __d, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) 
- { - return __cxx_atomic_load(&this->__a_, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - { - return __cxx_atomic_load(&this->__a_, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept - { - return load(); - } - _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept - { - return load(); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_exchange(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_exchange(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) 
noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); - } - } - - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - __cxx_atomic_wait(&this->__a_, __v, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - { - __cxx_atomic_wait(&this->__a_, __v, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - { - __cxx_atomic_notify_one(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - { - __cxx_atomic_notify_one(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - { - __cxx_atomic_notify_all(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - { - __cxx_atomic_notify_all(&this->__a_); - } -}; - -template -struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_Tp, _Storage> -{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = default; - __atomic_base_core(__atomic_base_core&&) = default; - - __atomic_base_core& operator=(const __atomic_base_core&) = default; - __atomic_base_core& operator=(__atomic_base_core&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept - : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - - _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept - { - return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); - } - _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept - { - return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); - } - _LIBCUDACXX_INLINE_VISIBILITY - - void - store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - { - __cxx_atomic_store(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - { - __cxx_atomic_store(&this->__a_, __d, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - { - return __cxx_atomic_load(&this->__a_, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - { - return __cxx_atomic_load(&this->__a_, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept - { - return load(); - } - _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept - { - return load(); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_exchange(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - { - 
return __cxx_atomic_exchange(&this->__a_, __d, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const - volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const - volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); - } - } - _LIBCUDACXX_INLINE_VISIBILITY bool - compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - { - if (memory_order_acq_rel == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - } - else if (memory_order_release == __m) - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - } - else - { - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); - } - } - - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - __cxx_atomic_wait(&this->__a_, __v, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept 
- { - __cxx_atomic_wait(&this->__a_, __v, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept - { - __cxx_atomic_notify_one(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept - { - __cxx_atomic_notify_one(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept - { - __cxx_atomic_notify_all(&this->__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept - { - __cxx_atomic_notify_all(&this->__a_); - } -}; - -template -struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> -{ - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = delete; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept - : __atomic_base_core<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) volatile noexcept - { - return fetch_add(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) noexcept - { - return fetch_add(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) volatile noexcept - { - return fetch_sub(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) noexcept - { - return fetch_sub(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() volatile noexcept - { - return fetch_add(_Tp(1)) + _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() noexcept - { - return fetch_add(_Tp(1)) + _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() volatile noexcept - { - return fetch_sub(_Tp(1)) - _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() noexcept - { - return fetch_sub(_Tp(1)) - _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) volatile noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) volatile noexcept - { - return fetch_sub(__op) - __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) noexcept - { - return fetch_sub(__op) - __op; - } -}; - -template -struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core<_Tp, true, _Storage> -{ - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = default; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic& 
operator=(__atomic_base_arithmetic&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept - : __atomic_base_core<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const volatile noexcept - { - return fetch_add(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const noexcept - { - return fetch_add(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const volatile noexcept - { - return fetch_sub(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const noexcept - { - return fetch_sub(_Tp(1)); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const volatile noexcept - { - return fetch_add(_Tp(1)) + _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const noexcept - { - return fetch_add(_Tp(1)) + _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const volatile noexcept - { - return fetch_sub(_Tp(1)) - _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const noexcept - { - return fetch_sub(_Tp(1)) - _Tp(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const volatile noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const volatile noexcept - { - return fetch_sub(__op) - __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const noexcept - { - return fetch_sub(__op) - __op; - } -}; - -template -struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storage> -{ - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise(__atomic_base_bitwise&&) = delete; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept - : __atomic_base_arithmetic<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_and(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_and(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_or(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - { - return 
__cxx_atomic_fetch_or(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) volatile noexcept - { - return fetch_and(__op) & __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) noexcept - { - return fetch_and(__op) & __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) volatile noexcept - { - return fetch_or(__op) | __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) noexcept - { - return fetch_or(__op) | __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) volatile noexcept - { - return fetch_xor(__op) ^ __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) noexcept - { - return fetch_xor(__op) ^ __op; - } -}; - -template -struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithmetic<_Tp, true, _Storage> -{ - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise(__atomic_base_bitwise&&) = default; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept - : __atomic_base_arithmetic<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_fetch_and(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_and(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_fetch_or(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_or(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const volatile noexcept - { - return fetch_and(__op) & __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const noexcept - { - return fetch_and(__op) & __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const volatile noexcept - { - return fetch_or(__op) | __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const noexcept - { - return fetch_or(__op) | __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const volatile noexcept - { - return fetch_xor(__op) ^ __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const noexcept - { - return fetch_xor(__op) ^ __op; - } -}; - -template -using __atomic_select_base = - __conditional_t::value, - 
__atomic_base_arithmetic<_Tp, _Cq, _Storage>, - __conditional_t::value, - __atomic_base_bitwise<_Tp, _Cq, _Storage>, - __atomic_base_core<_Tp, _Cq, _Storage>>>; - -template >> -struct __atomic_base : public _Base -{ - __atomic_base() = default; - __atomic_base(const __atomic_base&) = delete; - __atomic_base(__atomic_base&&) = delete; - - __atomic_base& operator=(const __atomic_base&) = delete; - __atomic_base& operator=(__atomic_base&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base(const _Tp& __a) noexcept - : _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) - {} -}; - -template >> -struct __atomic_base_ref : public _Base -{ - __atomic_base_ref() = default; - __atomic_base_ref(const __atomic_base_ref&) = default; - __atomic_base_ref(__atomic_base_ref&&) = default; - - __atomic_base_ref& operator=(const __atomic_base_ref&) = default; - __atomic_base_ref& operator=(__atomic_base_ref&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_ref(_Tp& __a) noexcept - : _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) - {} -}; - -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) -template -constexpr bool __atomic_base_core<_Tp, _Cq, _Storage>::is_always_lock_free; -#endif - -// atomic -template -struct atomic : public __atomic_base<_Tp> -{ - typedef __atomic_base<_Tp> __base; - using value_type = _Tp; - - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp __d) noexcept - : __base(__d) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) volatile noexcept - { - __base::store(__d); - return __d; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) noexcept - { - __base::store(__d); - return __d; - } -}; - -// atomic - -template -struct atomic<_Tp*> : public __atomic_base<_Tp*> -{ - typedef __atomic_base<_Tp*> __base; - using value_type = _Tp*; - - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp* __d) noexcept - : __base(__d) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __d) volatile noexcept - { - __base::store(__d); - return __d; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __d) noexcept - { - __base::store(__d); - return __d; - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) volatile noexcept - { - return fetch_add(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) noexcept - { - return fetch_add(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) volatile noexcept - { - return fetch_sub(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) noexcept - { - return fetch_sub(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() volatile noexcept - { - return fetch_add(1) + 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() noexcept - { - return fetch_add(1) + 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* 
operator--() volatile noexcept - { - return fetch_sub(1) - 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() noexcept - { - return fetch_sub(1) - 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) volatile noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) volatile noexcept - { - return fetch_sub(__op) - __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) noexcept - { - return fetch_sub(__op) - __op; - } -}; - -// atomic_ref - -template -struct atomic_ref : public __atomic_base_ref<_Tp> -{ - typedef __atomic_base_ref<_Tp> __base; - using value_type = _Tp; - - static constexpr size_t required_alignment = sizeof(_Tp); - - static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; - - _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp& __ref) - : __base(__ref) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __v) const volatile noexcept - { - __base::store(__v); - return __v; - } -}; - -// atomic_ref - -template -struct atomic_ref<_Tp*> : public __atomic_base_ref<_Tp*> -{ - typedef __atomic_base_ref<_Tp*> __base; - using value_type = _Tp*; - - static constexpr size_t required_alignment = sizeof(_Tp*); - - static constexpr bool is_always_lock_free = sizeof(_Tp*) <= 8; - - _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp*& __ref) - : __base(__ref) - {} - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __v) const noexcept - { - __base::store(__v); - return __v; - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_add(&this->__a_, __op, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - { - return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) const noexcept - { - return fetch_add(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) const noexcept - { - return fetch_sub(1); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() const noexcept - { - return fetch_add(1) + 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() const noexcept - { - return fetch_sub(1) - 1; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) const noexcept - { - return fetch_add(__op) + __op; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) const noexcept - { - return fetch_sub(__op) - __op; - } -}; - -// atomic_is_lock_free - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept -{ - return __o->is_lock_free(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const atomic<_Tp>* __o) noexcept -{ - return __o->is_lock_free(); -} - -// atomic_init - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept -{ - __cxx_atomic_init(&__o->__a_, __d); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept -{ - __cxx_atomic_init(&__o->__a_, __d); -} - -// atomic_store - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept -{ - __o->store(__d); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept -{ - __o->store(__d); -} - -// 
atomic_store_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) -{ - __o->store(__d, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) -{ - __o->store(__d, __m); -} - -// atomic_load - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const volatile atomic<_Tp>* __o) noexcept -{ - return __o->load(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const atomic<_Tp>* __o) noexcept -{ - return __o->load(); -} - -// atomic_load_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) -{ - return __o->load(__m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) -{ - return __o->load(__m); -} - -// atomic_exchange - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept -{ - return __o->exchange(__d); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept -{ - return __o->exchange(__d); -} - -// atomic_exchange_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept -{ - return __o->exchange(__d, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept -{ - return __o->exchange(__d, __m); -} - -// atomic_compare_exchange_weak - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept -{ - return __o->compare_exchange_weak(*__e, __d); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept -{ - return __o->compare_exchange_weak(*__e, __d); -} - -// atomic_compare_exchange_strong - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept -{ - return __o->compare_exchange_strong(*__e, __d); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept -{ - return __o->compare_exchange_strong(*__e, __d); -} - -// atomic_compare_exchange_weak_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak_explicit( - volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) -{ - return __o->compare_exchange_weak(*__e, __d, __s, __f); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool -atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) -{ - return __o->compare_exchange_weak(*__e, __d, __s, __f); -} - -// atomic_compare_exchange_strong_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( - volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) -{ - return __o->compare_exchange_strong(*__e, __d, __s, 
__f); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( - atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) -{ - return __o->compare_exchange_strong(*__e, __d, __s, __f); -} - -// atomic_wait - -template -_LIBCUDACXX_INLINE_VISIBILITY void -atomic_wait(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept -{ - return __o->wait(__v); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_wait(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept -{ - return __o->wait(__v); -} - -// atomic_wait_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY void -atomic_wait_explicit(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) -{ - return __o->wait(__v, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void -atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) -{ - return __o->wait(__v, __m); -} - -// atomic_notify_one - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept -{ - __o->notify_one(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(atomic<_Tp>* __o) noexcept -{ - __o->notify_one(); -} - -// atomic_notify_one - -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept -{ - __o->notify_all(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(atomic<_Tp>* __o) noexcept -{ - __o->notify_all(); -} - -// atomic_fetch_add - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_add(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_add(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept -{ - return __o->fetch_add(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept -{ - return __o->fetch_add(__op); -} - -// atomic_fetch_add_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_add(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_add(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept -{ - return __o->fetch_add(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -atomic_fetch_add_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept -{ - return __o->fetch_add(__op, __m); -} - -// atomic_fetch_sub - -template 
-_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_sub(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_sub(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept -{ - return __o->fetch_sub(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept -{ - return __o->fetch_sub(__op); -} - -// atomic_fetch_sub_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_sub(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY - __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> - atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_sub(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept -{ - return __o->fetch_sub(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -atomic_fetch_sub_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept -{ - return __o->fetch_sub(__op, __m); -} - -// atomic_fetch_and - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_and(volatile atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_and(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_and(atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_and(__op); -} - -// atomic_fetch_and_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_and(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_and_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_and(__op, __m); -} - -// atomic_fetch_or - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_or(volatile atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_or(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_or(atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_or(__op); -} - -// atomic_fetch_or_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_or(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_or_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_or(__op, __m); 
-} - -// atomic_fetch_xor - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_xor(volatile atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_xor(__op); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_xor(atomic<_Tp>* __o, _Tp __op) noexcept -{ - return __o->fetch_xor(__op); -} - -// atomic_fetch_xor_explicit - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_xor(__op, __m); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> -atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept -{ - return __o->fetch_xor(__op, __m); -} - -// flag type and operations - -typedef struct atomic_flag -{ - __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; - - _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept - { - return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); - } - _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept - { - return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); - } - - _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept - { - return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); - } - _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept - { - return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept - { - __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept - { - __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); - } - -#if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 - _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - { - __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept - { - __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - { - __cxx_atomic_notify_one(&__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - { - __cxx_atomic_notify_one(&__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - { - __cxx_atomic_notify_all(&__a_); - } - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - { - __cxx_atomic_notify_all(&__a_); - } -#endif - - atomic_flag() noexcept = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic_flag(bool __b) noexcept - : __a_(__b) - {} // EXTENSION - - atomic_flag(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) volatile = delete; -} atomic_flag; - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const volatile atomic_flag* __o) noexcept -{ - return __o->test(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const 
atomic_flag* __o) noexcept -{ - return __o->test(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool -atomic_flag_test_explicit(const volatile atomic_flag* __o, memory_order __m) noexcept -{ - return __o->test(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept -{ - return __o->test(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept -{ - return __o->test_and_set(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(atomic_flag* __o) noexcept -{ - return __o->test_and_set(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool -atomic_flag_test_and_set_explicit(volatile atomic_flag* __o, memory_order __m) noexcept -{ - return __o->test_and_set(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept -{ - return __o->test_and_set(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(volatile atomic_flag* __o) noexcept -{ - __o->clear(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(atomic_flag* __o) noexcept -{ - __o->clear(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void -atomic_flag_clear_explicit(volatile atomic_flag* __o, memory_order __m) noexcept -{ - __o->clear(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept -{ - __o->clear(__m); -} - -#if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept -{ - __o->wait(__v); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept -{ - __o->wait(__v); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void -atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) noexcept -{ - __o->wait(__v, __m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void -atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) noexcept -{ - __o->wait(__v, __m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(volatile atomic_flag* __o) noexcept -{ - __o->notify_one(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(atomic_flag* __o) noexcept -{ - __o->notify_one(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(volatile atomic_flag* __o) noexcept -{ - __o->notify_all(); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(atomic_flag* __o) noexcept -{ - __o->notify_all(); -} - -#endif - -// fences - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept -{ - __cxx_atomic_thread_fence(__m); -} - -inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept -{ - __cxx_atomic_signal_fence(__m); -} - -// Atomics for standard typedef types - -typedef atomic atomic_bool; -typedef atomic atomic_char; -typedef atomic atomic_schar; -typedef atomic atomic_uchar; -typedef atomic atomic_short; -typedef atomic atomic_ushort; -typedef atomic atomic_int; -typedef atomic atomic_uint; -typedef atomic atomic_long; -typedef atomic atomic_ulong; -typedef atomic atomic_llong; -typedef atomic atomic_ullong; -typedef atomic atomic_char16_t; -typedef atomic atomic_char32_t; -typedef atomic atomic_wchar_t; - -typedef atomic atomic_int_least8_t; -typedef atomic atomic_uint_least8_t; 
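// [Illustrative aside, not part of the diff] A minimal host-side sketch exercising the
// atomic_flag and fence entry points listed in this deleted block; the bool constructor
// used below is the EXTENSION constructor shown above, everything else is standard API:
//   cuda::std::atomic_flag f{false};
//   if (!f.test_and_set(cuda::std::memory_order_acquire)) { /* this thread won the race */ }
//   cuda::std::atomic_thread_fence(cuda::std::memory_order_seq_cst);
//   f.clear(cuda::std::memory_order_release);
//   f.notify_one(); // flag wait()/notify_*() are guarded to SM 7.0+ on device, per the #if above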
-typedef atomic atomic_int_least16_t; -typedef atomic atomic_uint_least16_t; -typedef atomic atomic_int_least32_t; -typedef atomic atomic_uint_least32_t; -typedef atomic atomic_int_least64_t; -typedef atomic atomic_uint_least64_t; - -typedef atomic atomic_int_fast8_t; -typedef atomic atomic_uint_fast8_t; -typedef atomic atomic_int_fast16_t; -typedef atomic atomic_uint_fast16_t; -typedef atomic atomic_int_fast32_t; -typedef atomic atomic_uint_fast32_t; -typedef atomic atomic_int_fast64_t; -typedef atomic atomic_uint_fast64_t; - -typedef atomic atomic_int8_t; -typedef atomic atomic_uint8_t; -typedef atomic atomic_int16_t; -typedef atomic atomic_uint16_t; -typedef atomic atomic_int32_t; -typedef atomic atomic_uint32_t; -typedef atomic atomic_int64_t; -typedef atomic atomic_uint64_t; - -typedef atomic atomic_intptr_t; -typedef atomic atomic_uintptr_t; -typedef atomic atomic_size_t; -typedef atomic atomic_ptrdiff_t; -typedef atomic atomic_intmax_t; -typedef atomic atomic_uintmax_t; - -static_assert(ATOMIC_INT_LOCK_FREE, "This library assumes atomic is lock-free."); - -typedef atomic atomic_signed_lock_free; -typedef atomic atomic_unsigned_lock_free; - -#define ATOMIC_FLAG_INIT \ - { \ - false \ - } -#define ATOMIC_VAR_INIT(__v) \ - { \ - __v \ - } - -_LIBCUDACXX_END_NAMESPACE_STD - -#include -_CCCL_POP_MACROS - -#endif // _LIBCUDACXX_ATOMIC diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier index c2308aeb88..58e0e2d240 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/barrier +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/barrier @@ -79,17 +79,17 @@ template class alignas(64) __barrier_base { ptrdiff_t __expected; - __atomic_base __expected_adjustment; + __atomic_impl __expected_adjustment; _CompletionF __completion; using __phase_t = uint8_t; - __atomic_base<__phase_t, _Sco> __phase; + __atomic_impl<__phase_t, _Sco> __phase; struct alignas(64) __state_t { struct { - __atomic_base<__phase_t, _Sco> __phase = ATOMIC_VAR_INIT(0); + __atomic_impl<__phase_t, _Sco> __phase = LIBCUDACXX_ATOMIC_VAR_INIT(0); } __tickets[64]; }; ::std::vector<__state_t> __state; @@ -263,12 +263,12 @@ _LIBCUDACXX_INLINE_VISIBILITY bool __call_try_wait_parity(const _Barrier& __b, b return __b.__try_wait_parity(__parity); } -template +template class __barrier_base { - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __expected, __arrived; + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __expected, __arrived; _LIBCUDACXX_BARRIER_ALIGNMENTS _CompletionF __completion; - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase; + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase; public: using arrival_token = bool; @@ -321,7 +321,7 @@ public: __completion(); __arrived.store(__new_expected, memory_order_relaxed); __phase.store(!__old_phase, memory_order_release); - __cxx_atomic_notify_all(&__phase.__a_); + __atomic_notify_all(&__phase.__a, __scope_to_tag<_Sco>{}); } return __old_phase; } @@ -345,7 +345,7 @@ public: } }; -template +template class __barrier_base<__empty_completion, _Sco> { static constexpr uint64_t __expected_unit = 1ull; @@ -354,7 +354,7 @@ class __barrier_base<__empty_completion, _Sco> static constexpr uint64_t __phase_bit = 1ull << 63; static constexpr uint64_t __arrived_mask = (__phase_bit - 1) & ~__expected_mask; - _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_base __phase_arrived_expected; + _LIBCUDACXX_BARRIER_ALIGNMENTS __atomic_impl __phase_arrived_expected; public: using arrival_token = uint64_t; @@ -457,6 +457,7 @@ 
public: _LIBCUDACXX_END_NAMESPACE_STD #include + _CCCL_POP_MACROS #endif //_LIBCUDACXX_BARRIER diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/latch b/libcudacxx/include/cuda/std/detail/libcxx/include/latch index 1272091737..26442e8283 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/latch +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/latch @@ -48,9 +48,11 @@ namespace std # pragma system_header #endif // no system header +#include #include #include // all public C++ headers provide the assertion handler #include +#include _CCCL_PUSH_MACROS @@ -66,10 +68,10 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD # define _LIBCUDACXX_LATCH_ALIGNMENT #endif -template +template class __latch_base { - _LIBCUDACXX_LATCH_ALIGNMENT __atomic_base __counter; + _LIBCUDACXX_LATCH_ALIGNMENT __atomic_impl __counter; public: inline _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __latch_base(ptrdiff_t __expected) @@ -123,6 +125,7 @@ using latch = __latch_base<>; _LIBCUDACXX_END_NAMESPACE_STD #include + _CCCL_POP_MACROS #endif //_LIBCUDACXX_LATCH diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/semaphore b/libcudacxx/include/cuda/std/detail/libcxx/include/semaphore index 6f2f3f9c12..74b421d903 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/semaphore +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/semaphore @@ -68,7 +68,7 @@ _CCCL_PUSH_MACROS _LIBCUDACXX_BEGIN_NAMESPACE_STD -template +template class __atomic_semaphore_base { _LIBCUDACXX_INLINE_VISIBILITY bool __fetch_sub_if_slow(ptrdiff_t __old) @@ -119,7 +119,7 @@ class __atomic_semaphore_base }, __rel_time); } - __atomic_base __count; + __atomic_impl __count; public: _LIBCUDACXX_INLINE_VISIBILITY static constexpr ptrdiff_t max() noexcept @@ -191,7 +191,7 @@ public: #ifndef _LIBCUDACXX_USE_NATIVE_SEMAPHORES -template +template class __atomic_semaphore_base<_Sco, 1> { _LIBCUDACXX_INLINE_VISIBILITY bool __acquire_slow_timed(chrono::nanoseconds const& __rel_time) @@ -202,7 +202,7 @@ class __atomic_semaphore_base<_Sco, 1> }, __rel_time); } - __atomic_base __available; + __atomic_impl __available; public: _LIBCUDACXX_INLINE_VISIBILITY static constexpr ptrdiff_t max() noexcept @@ -269,7 +269,7 @@ public: #else -template +template class __sem_semaphore_base { _LIBCUDACXX_INLINE_VISIBILITY bool __backfill(bool __success) @@ -278,81 +278,83 @@ class __sem_semaphore_base if (__success) { auto const __back_amount = __backbuffer.fetch_sub(2, memory_order_acquire); - bool const __post_one = __back_amount > 0; - bool const __post_two = __back_amount > 1; - auto const __success = (!__post_one || __libcpp_semaphore_post(&__semaphore)) - && (!__post_two || __libcpp_semaphore_post(&__semaphore)); - _LIBCUDACXX_ASSERT(__success, ""); - if (!__post_one || !__post_two) - { - __backbuffer.fetch_add(!__post_one ? 2 : 1, memory_order_relaxed); - } } -# endif - return __success; + bool const __post_one = __back_amount > 0; + bool const __post_two = __back_amount > 1; + auto const __success = + (!__post_one || __libcpp_semaphore_post(&__semaphore)) && (!__post_two || __libcpp_semaphore_post(&__semaphore)); + _LIBCUDACXX_ASSERT(__success, ""); + if (!__post_one || !__post_two) + { + __backbuffer.fetch_add(!__post_one ? 
2 : 1, memory_order_relaxed); + } } +# endif + return __success; +} - _LIBCUDACXX_INLINE_VISIBILITY bool __try_acquire_fast() - { +_LIBCUDACXX_INLINE_VISIBILITY bool +__try_acquire_fast() +{ # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - ptrdiff_t __old; - __libcpp_thread_poll_with_backoff( - [&]() { - __old = __frontbuffer.load(memory_order_relaxed); - return 0 != (__old >> 32); - }, - chrono::microseconds(5)); + ptrdiff_t __old; + __libcpp_thread_poll_with_backoff( + [&]() { + __old = __frontbuffer.load(memory_order_relaxed); + return 0 != (__old >> 32); + }, + chrono::microseconds(5)); - // always steal if you can - while (__old >> 32) - { - if (__frontbuffer.compare_exchange_weak(__old, __old - (1ll << 32), memory_order_acquire)) - { - return true; - } - } - // record we're waiting - __old = __frontbuffer.fetch_add(1ll, memory_order_release); - // ALWAYS steal if you can! - while (__old >> 32) + // always steal if you can + while (__old >> 32) + { + if (__frontbuffer.compare_exchange_weak(__old, __old - (1ll << 32), memory_order_acquire)) { - if (__frontbuffer.compare_exchange_weak(__old, __old - (1ll << 32), memory_order_acquire)) - { - break; - } + return true; } - // not going to wait after all - if (__old >> 32) + } + // record we're waiting + __old = __frontbuffer.fetch_add(1ll, memory_order_release); + // ALWAYS steal if you can! + while (__old >> 32) + { + if (__frontbuffer.compare_exchange_weak(__old, __old - (1ll << 32), memory_order_acquire)) { - return __try_done(true); + break; } -# endif - // the wait has begun... - return false; } - - _LIBCUDACXX_INLINE_VISIBILITY bool __try_done(bool __success) + // not going to wait after all + if (__old >> 32) { + return __try_done(true); + } +# endif + // the wait has begun... + return false; +} + +_LIBCUDACXX_INLINE_VISIBILITY bool __try_done(bool __success) +{ # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - // record we're NOT waiting - __frontbuffer.fetch_sub(1ll, memory_order_release); + // record we're NOT waiting + __frontbuffer.fetch_sub(1ll, memory_order_release); # endif - return __backfill(__success); - } + return __backfill(__success); +} - _LIBCUDACXX_INLINE_VISIBILITY void __release_slow(ptrdiff_t __post_amount) - { +_LIBCUDACXX_INLINE_VISIBILITY void __release_slow(ptrdiff_t __post_amount) +{ # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_BACK_BUFFER - bool const __post_one = __post_amount > 0; - bool const __post_two = __post_amount > 1; - if (__post_amount > 2) - { - __backbuffer.fetch_add(__post_amount - 2, memory_order_acq_rel); - } - auto const __success = - (!__post_one || __libcpp_semaphore_post(&__semaphore)) && (!__post_two || __libcpp_semaphore_post(&__semaphore)); - _LIBCUDACXX_ASSERT(__success, ""); + bool const __post_one = __post_amount > 0; + bool const __post_two = __post_amount > 1; + if (__post_amount > 2) + { + __backbuffer.fetch_add(__post_amount - 2, memory_order_acq_rel); + } + auto const __success = + (!__post_one || __libcpp_semaphore_post(&__semaphore)) && (!__post_two || __libcpp_semaphore_post(&__semaphore)); + _LIBCUDACXX_ASSERT(__success, ""); # else for (; __post_amount; --__post_amount) { @@ -360,101 +362,102 @@ class __sem_semaphore_base _LIBCUDACXX_ASSERT(__success, ""); } # endif - } +} - __libcpp_semaphore_t __semaphore; +__libcpp_semaphore_t __semaphore; # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - __atomic_base __frontbuffer; +__atomic_impl __frontbuffer; # endif # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_BACK_BUFFER - __atomic_base __backbuffer; +__atomic_impl __backbuffer; # 
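// Illustrative sketch (hypothetical names) of the "front buffer" trick used by
// __sem_semaphore_base above: one 64-bit atomic packs the available count in
// the upper 32 bits and the number of recorded waiters in the lower 32 bits,
// so an uncontended acquire is a single compare-exchange that never touches
// the OS semaphore. The real code additionally polls with backoff first.
#include <atomic>
#include <cstdint>

inline bool try_acquire_fast(std::atomic<std::int64_t>& front) {
  std::int64_t old = front.load(std::memory_order_relaxed);
  while ((old >> 32) > 0) {   // upper half: units currently available
    if (front.compare_exchange_weak(old, old - (std::int64_t{1} << 32),
                                    std::memory_order_acquire,
                                    std::memory_order_relaxed)) {
      return true;            // stole one unit without blocking
    }
    // CAS failure reloaded `old`; re-check and retry.
  }
  return false;               // nothing available: caller takes the slow path
}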
endif public: - static constexpr ptrdiff_t max() noexcept - { - return _LIBCUDACXX_SEMAPHORE_MAX; - } +static constexpr ptrdiff_t max() noexcept +{ + return _LIBCUDACXX_SEMAPHORE_MAX; +} - _LIBCUDACXX_INLINE_VISIBILITY __sem_semaphore_base(ptrdiff_t __count = 0) - : __semaphore() +_LIBCUDACXX_INLINE_VISIBILITY __sem_semaphore_base(ptrdiff_t __count = 0) + : __semaphore() # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - , __frontbuffer(__count << 32) + , __frontbuffer(__count << 32) # endif # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_BACK_BUFFER - , __backbuffer(0) + , __backbuffer(0) # endif - { - _LIBCUDACXX_ASSERT(__count <= max(), ""); - auto const __success = +{ + _LIBCUDACXX_ASSERT(__count <= max(), ""); + auto const __success = # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - __libcpp_semaphore_init(&__semaphore, 0); + __libcpp_semaphore_init(&__semaphore, 0); # else __libcpp_semaphore_init(&__semaphore, __count); # endif - _LIBCUDACXX_ASSERT(__success, ""); - } + _LIBCUDACXX_ASSERT(__success, ""); +} - _LIBCUDACXX_INLINE_VISIBILITY ~__sem_semaphore_base() - { +_LIBCUDACXX_INLINE_VISIBILITY ~__sem_semaphore_base() +{ # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - _LIBCUDACXX_ASSERT(0 == (__frontbuffer.load(memory_order_relaxed) & ~0u), ""); + _LIBCUDACXX_ASSERT(0 == (__frontbuffer.load(memory_order_relaxed) & ~0u), ""); # endif - auto const __success = __libcpp_semaphore_destroy(&__semaphore); - _LIBCUDACXX_ASSERT(__success, ""); - } + auto const __success = __libcpp_semaphore_destroy(&__semaphore); + _LIBCUDACXX_ASSERT(__success, ""); +} - __sem_semaphore_base(const __sem_semaphore_base&) = delete; - __sem_semaphore_base& operator=(const __sem_semaphore_base&) = delete; +__sem_semaphore_base(const __sem_semaphore_base&) = delete; +__sem_semaphore_base& operator=(const __sem_semaphore_base&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY void release(ptrdiff_t __update = 1) - { +_LIBCUDACXX_INLINE_VISIBILITY void release(ptrdiff_t __update = 1) +{ # ifndef _LIBCUDACXX_HAS_NO_SEMAPHORE_FRONT_BUFFER - // boldly assume the semaphore is taken but uncontended - ptrdiff_t __old = 0; - // try to fast-release as long as it's uncontended - while (0 == (__old & ~0ul)) - { - if (__frontbuffer.compare_exchange_weak(__old, __old + (__update << 32), memory_order_acq_rel)) - { - return; - } - } -# endif - // slow-release it is - __release_slow(__update); - } - - _LIBCUDACXX_INLINE_VISIBILITY void acquire() + // boldly assume the semaphore is taken but uncontended + ptrdiff_t __old = 0; + // try to fast-release as long as it's uncontended + while (0 == (__old & ~0ul)) { - if (!__try_acquire_fast()) + if (__frontbuffer.compare_exchange_weak(__old, __old + (__update << 32), memory_order_acq_rel)) { - __try_done(__libcpp_semaphore_wait(&__semaphore)); + return; } } +# endif + // slow-release it is + __release_slow(__update); +} - _LIBCUDACXX_INLINE_VISIBILITY bool try_acquire() noexcept +_LIBCUDACXX_INLINE_VISIBILITY void acquire() +{ + if (!__try_acquire_fast()) { - return try_acquire_for(chrono::nanoseconds(0)); + __try_done(__libcpp_semaphore_wait(&__semaphore)); } +} - template - _LIBCUDACXX_INLINE_VISIBILITY bool try_acquire_until(chrono::time_point const& __abs_time) - { - auto const current = max(Clock::now(), __abs_time); - return try_acquire_for(chrono::duration_cast(__abs_time - current)); - } +_LIBCUDACXX_INLINE_VISIBILITY bool try_acquire() noexcept +{ + return try_acquire_for(chrono::nanoseconds(0)); +} - template - _LIBCUDACXX_INLINE_VISIBILITY bool try_acquire_for(chrono::duration 
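// Companion sketch to the release() fast path shown above (illustrative names,
// and a simplified reading of the packed word): while the low half of the
// word records no waiters, a release can simply add `update << 32`; once
// waiters are recorded, the slow path posts the OS semaphore instead.
#include <atomic>
#include <cstdint>

inline bool try_release_fast(std::atomic<std::int64_t>& front, std::int64_t update) {
  std::int64_t old = 0;                           // optimistically assume no waiters
  while ((old & 0xffffffffll) == 0) {             // low half: recorded waiters
    if (front.compare_exchange_weak(old, old + (update << 32),
                                    std::memory_order_acq_rel,
                                    std::memory_order_relaxed)) {
      return true;                                // nobody was waiting; done
    }
  }
  return false;                                   // waiters present: use the slow path
}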
const& __rel_time) - { - return __try_acquire_fast() || __try_done(__libcpp_semaphore_wait_timed(&__semaphore, __rel_time)); - } -}; +template +_LIBCUDACXX_INLINE_VISIBILITY bool try_acquire_until(chrono::time_point const& __abs_time) +{ + auto const current = max(Clock::now(), __abs_time); + return try_acquire_for(chrono::duration_cast(__abs_time - current)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY bool try_acquire_for(chrono::duration const& __rel_time) +{ + return __try_acquire_fast() || __try_done(__libcpp_semaphore_wait_timed(&__semaphore, __rel_time)); +} +} +; #endif //_LIBCUDACXX_HAS_NO_SEMAPHORES -template +template using __semaphore_base = #ifdef _LIBCUDACXX_USE_NATIVE_SEMAPHORES __conditional_t<__least_max_value <= __sem_semaphore_base<_Sco>::max(), @@ -466,13 +469,13 @@ using __semaphore_base = ; template -class counting_semaphore : public __semaphore_base<__least_max_value, 0> +class counting_semaphore : public __semaphore_base<__least_max_value, thread_scope_system> { - static_assert(__least_max_value <= __semaphore_base<__least_max_value, 0>::max(), ""); + static_assert(__least_max_value <= __semaphore_base<__least_max_value, thread_scope_system>::max(), ""); public: _LIBCUDACXX_INLINE_VISIBILITY constexpr counting_semaphore(ptrdiff_t __count = 0) - : __semaphore_base<__least_max_value, 0>(__count) + : __semaphore_base<__least_max_value, thread_scope_system>(__count) {} ~counting_semaphore() = default; diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h deleted file mode 100644 index 65be5cfd97..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h +++ /dev/null @@ -1,246 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_BASE_H -#define _LIBCUDACXX_ATOMIC_BASE_H - -#include - -// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELEASE - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_failure_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELAXED - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? 
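// Minimal sketch of the selection idiom behind the __semaphore_base alias
// above: pick one backend or another at compile time by comparing the
// requested ceiling against a threshold. The class names and the threshold
// here are placeholders, not the library's.
#include <cstddef>
#include <type_traits>

template <std::ptrdiff_t LeastMaxValue>
struct counting_backend { /* general counting implementation */ };

struct binary_backend { /* specialized single-slot implementation */ };

template <std::ptrdiff_t LeastMaxValue>
using semaphore_backend =
    std::conditional_t<(LeastMaxValue <= 1),
                       binary_backend,
                       counting_backend<LeastMaxValue>>;

static_assert(std::is_same_v<semaphore_backend<1>, binary_backend>);
static_assert(std::is_same_v<semaphore_backend<256>, counting_backend<256>>);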
__ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); -} - -template -inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - __cxx_atomic_assign_volatile(*__a_tmp, __val); -} - -template -inline void __cxx_atomic_init(_Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - *__a_tmp = __val; -} - -inline void __cxx_atomic_thread_fence(memory_order __order) -{ - __atomic_thread_fence(__cxx_atomic_order_to_int(__order)); -} - -inline void __cxx_atomic_signal_fence(memory_order __order) -{ - __atomic_signal_fence(__cxx_atomic_order_to_int(__order)); -} - -template -inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - __atomic_store(__cxx_atomic_unwrap(__a), &__v_temp, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_load(__cxx_atomic_unwrap(__a), &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline auto __cxx_atomic_exchange(_Tp* __a, _Up __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_exchange(__cxx_atomic_unwrap(__a), &__v_temp, &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - false, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -inline bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - true, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -struct __atomic_ptr_inc -{ - enum - { - value = 1 - }; -}; - -template -struct __atomic_ptr_inc<_Tp*> -{ - enum - { - value = sizeof(_Tp) - }; -}; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. 
-template -struct __atomic_ptr_inc<_Tp[]> -{}; -template -struct __atomic_ptr_inc<_Tp[n]> -{}; - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected + __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected + __delta; - } - - return __expected; -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected - __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected - __delta; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_and(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_or(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_xor(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected > __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected > __val ? __expected : __val; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected < __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected < __val ? 
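// Sketch of the compare-exchange fallback that the deleted __cxx_atomic_fetch_add
// above uses for types without a native atomic add (e.g. floating point):
// load once, then retry a CAS until the addition lands. Written here with
// std::atomic rather than the __atomic_* builtins, purely for illustration.
#include <atomic>

template <class T>
T cas_fetch_add(std::atomic<T>& a, T delta, std::memory_order order) {
  T expected = a.load(std::memory_order_relaxed);
  T desired  = expected + delta;
  // On failure, compare_exchange_strong refreshes `expected`, so recompute.
  while (!a.compare_exchange_strong(expected, desired, order, std::memory_order_relaxed)) {
    desired = expected + delta;
  }
  return expected;   // the value observed immediately before the update
}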
__expected : __val; - } - - return __expected; -} - -#endif // _LIBCUDACXX_ATOMIC_BASE_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h deleted file mode 100644 index 1e5c55d243..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// Atomics for C11 - -template -struct __cxx_atomic_base_impl -{ - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_base_impl() noexcept = default; - - constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - _LIBCUDACXX_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; -}; - -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __c11_atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_thread_fence(memory_order __order) noexcept -{ - __c11_atomic_thread_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_signal_fence(memory_order __order) noexcept -{ - __c11_atomic_signal_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const volatile* __a, memory_order __order) noexcept -{ - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const* __a, memory_order __order) noexcept -{ - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} -template 
-_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, 
static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h deleted file mode 100644 index b6fa9a16fd..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h +++ /dev/null @@ -1,787 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#if defined(__CUDA_MINIMUM_ARCH__) \ - && ((!defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 600) \ - || (defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 700)) -# error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows." -#endif - -inline _CCCL_HOST_DEVICE int __stronger_order_cuda(int __a, int __b) -{ - int const __max = __a > __b ? 
__a : __b; - if (__max != __ATOMIC_RELEASE) - { - return __max; - } - static int const __xform[] = {__ATOMIC_RELEASE, __ATOMIC_ACQ_REL, __ATOMIC_ACQ_REL, __ATOMIC_RELEASE}; - return __xform[__a < __b ? __a : __b]; -} - -// pre-define lock free query for heterogeneous compatibility -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) (__x <= 8) -#endif - -// Wrap host atomic implementations into a sub-namespace -namespace __host -{ -#if defined(_CCCL_COMPILER_MSVC) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C11_ATOMIC_IMP) -// TODO -// # include -#elif defined(_CCCL_COMPILER_NVRTC) -# include -#endif -} // namespace __host - -using __host::__cxx_atomic_underlying_t; - -#include -#include - -_CCCL_HOST_DEVICE inline void __cxx_atomic_thread_fence(memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), - NV_IS_HOST, - (__host::__cxx_atomic_thread_fence(__order);)) -} - -_CCCL_HOST_DEVICE inline void __cxx_atomic_signal_fence(memory_order __order) -{ - NV_DISPATCH_TARGET(NV_IS_DEVICE, - (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), - NV_IS_HOST, - (__host::__cxx_atomic_signal_fence(__order);)) -} - -template -struct __cxx_atomic_base_heterogeneous_impl -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_base_impl<_Tp, _Sco> __a_value; -}; - -template -struct __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true> -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - static_assert(sizeof(_Tp) >= 4, "atomic_ref does not support 1 or 2 byte types"); - static_assert(sizeof(_Tp) <= 8, "atomic_ref does not support types larger than 8 bytes"); - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp& __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_ref_base_impl<_Tp, _Sco> __a_value; -}; - -template -_CCCL_HOST_DEVICE constexpr _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr volatile _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const volatile _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -using __cxx_atomic_small_to_32 = __conditional_t::value, int32_t, uint32_t>; - -// Arithmetic conversions to/from proxy types -template ::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - return static_cast<__cxx_atomic_small_to_32<_Tp>>(__val); -} - -template 
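// Illustrative check of the "stronger of two orders" rule implemented by
// __stronger_order_cuda above: the numerically larger order wins, except that
// pairing release with an acquire-ish order must widen to acq_rel. The
// constants mirror the __ATOMIC_* values the builtins use.
constexpr int RELAXED = 0, CONSUME = 1, ACQUIRE = 2, RELEASE = 3, ACQ_REL = 4, SEQ_CST = 5;

constexpr int stronger_order(int a, int b) {
  int const max = a > b ? a : b;
  if (max != RELEASE) {
    return max;                                    // no mixing problem
  }
  // max == RELEASE: the result depends on what it is paired with.
  constexpr int xform[] = {RELEASE, ACQ_REL, ACQ_REL, RELEASE};
  return xform[a < b ? a : b];
}

static_assert(stronger_order(ACQUIRE, RELEASE) == ACQ_REL, "acquire + release widens to acq_rel");
static_assert(stronger_order(RELAXED, RELEASE) == RELEASE, "relaxed + release stays release");
static_assert(stronger_order(SEQ_CST, RELEASE) == SEQ_CST, "seq_cst dominates");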
::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - return static_cast<_Tp>(__val); -} - -// Non-arithmetic conversion to/from proxy types -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - __cxx_atomic_small_to_32<_Tp> __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - _Tp __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template -struct __cxx_atomic_base_small_impl -{ - __cxx_atomic_base_small_impl() noexcept = default; - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_small_impl(_Tp __value) - : __a_value(__cxx_small_to_32(__value)) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __cxx_atomic_base_heterogeneous_impl<__cxx_atomic_small_to_32<_Tp>, _Sco, false> __a_value; -}; - -template -using __cxx_atomic_base_impl = - __conditional_t, - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco>>; - -template -using __cxx_atomic_ref_base_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE 
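// Sketch of the 1- and 2-byte proxy scheme above (hypothetical helper names):
// values too small for the hardware's atomic word are widened to a 32-bit
// proxy -- by integral conversion for arithmetic types, by memcpy otherwise --
// before the real atomic operation runs, and narrowed back afterwards.
// Assumes T is trivially copyable and default-constructible.
#include <cstdint>
#include <cstring>
#include <type_traits>

template <class T>
using proxy32_t = std::conditional_t<std::is_signed_v<T>, std::int32_t, std::uint32_t>;

template <class T>
proxy32_t<T> to_proxy32(T value) {
  static_assert(sizeof(T) <= 4, "only sub-word types need the proxy");
  proxy32_t<T> proxy{};
  std::memcpy(&proxy, &value, sizeof(T));   // copy the payload into the wider word
  return proxy;
}

template <class T>
T from_proxy32(proxy32_t<T> proxy) {
  T value{};
  std::memcpy(&value, &proxy, sizeof(T));   // recover the original representation
  return value;
}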
_Tp -__cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_exchange( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order 
__failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - 
NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_and( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_or( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_xor( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_xor(&__a->__a_value, 
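// Hedged sketch (hypothetical helper, CUDA C++) of the host/device dispatch
// shape used by every operation in the deleted atomic_cuda.h: device code
// forwards to a device-side helper, host code to the host backend. The real
// code dispatches with NV_DISPATCH_TARGET from <nv/target>; a plain
// __CUDA_ARCH__ split is shown here for brevity. Compile with nvcc; the host
// branch assumes a GCC/Clang host compiler.
__host__ __device__ inline int fetch_add_system(int* ptr, int value)
{
#if defined(__CUDA_ARCH__)
  return atomicAdd_system(ptr, value);                    // device path, sm_60 or newer
#else
  return __atomic_fetch_add(ptr, value, __ATOMIC_SEQ_CST); // host path via compiler builtin
#endif
}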
__pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_max( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_min( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE inline void __cxx_atomic_init(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val) -{ - __cxx_atomic_init(&__a->__a_value, __cxx_small_to_32(__val)); -} - -template -_CCCL_HOST_DEVICE inline void -__cxx_atomic_store(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) -{ - __cxx_atomic_store(&__a->__a_value, __cxx_small_to_32(__val), __order); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_load(__cxx_atomic_base_small_impl<_Tp, _Sco> const volatile* __a, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_load(&__a->__a_value, __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order)); -} -_CCCL_HOST_DEVICE inline int __cuda_memcmp(void const* __lhs, void const* __rhs, size_t __count) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (auto __lhs_c = reinterpret_cast(__lhs); - auto __rhs_c = reinterpret_cast(__rhs); - while (__count--) { - auto const __lhs_v = *__lhs_c++; - auto const __rhs_v = *__rhs_c++; - if (__lhs_v < __rhs_v) - { - return -1; - } - if (__lhs_v > __rhs_v) - { - return 1; - } - } return 0;), - NV_IS_HOST, - (return memcmp(__lhs, __rhs, __count);)) -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto __temp = __cxx_small_to_32(*__expected); - auto const __ret = - __cxx_atomic_compare_exchange_weak(&__a->__a_value, &__temp, 
__cxx_small_to_32(__value), __success, __failure); - auto const __actual = __cxx_small_from_32<_Tp>(__temp); - constexpr auto __mask = static_cast((1u << (8 * sizeof(_Tp))) - 1); - if (!__ret) - { - if (0 == __cuda_memcmp(&__actual, __expected, sizeof(_Tp))) - { - __cxx_atomic_fetch_and(&__a->__a_value, __mask, memory_order_relaxed); - } - else - { - *__expected = __actual; - } - } - return __ret; -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto const __old = *__expected; - while (1) - { - if (__cxx_atomic_compare_exchange_weak(__a, __expected, __value, __success, __failure)) - { - return true; - } - if (0 != __cuda_memcmp(&__old, __expected, sizeof(_Tp))) - { - return false; - } - } -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_max(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_min(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h deleted file mode 100644 index 891b0ffe1c..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h +++ /dev/null @@ -1,190 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -template ::type = 0> -bool _CCCL_DEVICE __atomic_compare_exchange_cuda( - _Type volatile* __ptr, - _Type* __expected, - const _Type* __desired, - bool, - int __success_memorder, - int __failure_memorder, - _Scope __s) -{ - auto const __aligned = (uint32_t*) ((intptr_t) __ptr & ~(sizeof(uint32_t) - 1)); - auto const __offset = uint32_t((intptr_t) __ptr & (sizeof(uint32_t) - 1)) * 8; - auto const __mask = ((1 << sizeof(_Type) * 8) - 1) << __offset; - - uint32_t __old = *__expected << __offset; - uint32_t __old_value; - while (1) - { - __old_value = (__old & __mask) >> __offset; - if (__old_value != *__expected) - { - break; - } - uint32_t const __attempt = (__old & ~__mask) | (*__desired << __offset); - if (__atomic_compare_exchange_cuda(__aligned, &__old, &__attempt, true, __success_memorder, __failure_memorder, __s)) - { - return true; - } - } - *__expected = __old_value; - return false; -} - -template ::type = 0> -void _CCCL_DEVICE __atomic_exchange_cuda(_Type volatile* __ptr, _Type* __val, _Type* __ret, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, __val, true, __memorder, __memorder, __s)) - ; - *__ret = __expected; -} - -template ::type = 0> -_Type _CCCL_DEVICE __atomic_fetch_add_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected + __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected + __val; - } - return __expected; -} - -template < - class _Type, - class _Delta, - class _Scope, - typename _CUDA_VSTD::enable_if::value, int>::type = 0> -_Type _CCCL_HOST_DEVICE __atomic_fetch_max_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected > __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected > __val ? __expected : __val; - } - - return __expected; -} - -template < - class _Type, - class _Delta, - class _Scope, - typename _CUDA_VSTD::enable_if::value, int>::type = 0> -_Type _CCCL_HOST_DEVICE __atomic_fetch_min_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected < __val ? __expected : __val; - - while (__desired == __val - && !__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected < __val ? 
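// Illustrative host-side re-statement (hypothetical name) of the sub-word CAS
// emulation deleted above: a 1-byte compare-exchange is carried out on the
// enclosing aligned 32-bit word, shifting and masking so the neighbouring
// bytes are preserved. Assumes offset_bytes < 4.
#include <atomic>
#include <cstdint>

inline bool byte_cas(std::atomic<std::uint32_t>& word, unsigned offset_bytes,
                     std::uint8_t& expected, std::uint8_t desired)
{
  unsigned const shift      = offset_bytes * 8;
  std::uint32_t const mask  = std::uint32_t{0xff} << shift;

  std::uint32_t old = word.load(std::memory_order_relaxed);
  for (;;) {
    std::uint8_t const old_byte = static_cast<std::uint8_t>((old & mask) >> shift);
    if (old_byte != expected) {
      expected = old_byte;            // report the value actually observed
      return false;
    }
    std::uint32_t const attempt = (old & ~mask) | (std::uint32_t{desired} << shift);
    if (word.compare_exchange_weak(old, attempt, std::memory_order_acq_rel,
                                   std::memory_order_relaxed)) {
      return true;                    // only our byte changed
    }
    // CAS failure refreshed `old`; re-extract the byte and try again.
  }
}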
__expected : __val; - } - - return __expected; -} - -template ::type = 0> -_Type _CCCL_DEVICE __atomic_fetch_sub_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected - __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected - __val; - } - return __expected; -} - -template ::type = 0> -_Type _CCCL_DEVICE __atomic_fetch_and_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected & __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected & __val; - } - return __expected; -} - -template ::type = 0> -_Type _CCCL_DEVICE __atomic_fetch_xor_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected ^ __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected ^ __val; - } - return __expected; -} - -template ::type = 0> -_Type _CCCL_DEVICE __atomic_fetch_or_cuda(_Type volatile* __ptr, _Delta __val, int __memorder, _Scope __s) -{ - _Type __expected = __atomic_load_n_cuda(__ptr, __ATOMIC_RELAXED, __s); - _Type __desired = __expected | __val; - while (!__atomic_compare_exchange_cuda(__ptr, &__expected, &__desired, true, __memorder, __memorder, __s)) - { - __desired = __expected | __val; - } - return __expected; -} - -template -_Type _CCCL_DEVICE __atomic_load_n_cuda(const _Type volatile* __ptr, int __memorder, _Scope __s) -{ - _Type __ret; - __atomic_load_cuda(__ptr, &__ret, __memorder, __s); - return __ret; -} - -template -void _CCCL_DEVICE __atomic_store_n_cuda(_Type volatile* __ptr, _Type __val, int __memorder, _Scope __s) -{ - __atomic_store_cuda(__ptr, &__val, __memorder, __s); -} - -template -bool _CCCL_DEVICE __atomic_compare_exchange_n_cuda( - _Type volatile* __ptr, - _Type* __expected, - _Type __desired, - bool __weak, - int __success_memorder, - int __failure_memorder, - _Scope __s) -{ - return __atomic_compare_exchange_cuda( - __ptr, __expected, &__desired, __weak, __success_memorder, __failure_memorder, __s); -} - -template -_Type _CCCL_DEVICE __atomic_exchange_n_cuda(_Type volatile* __ptr, _Type __val, int __memorder, _Scope __s) -{ - _Type __ret; - __atomic_exchange_cuda(__ptr, &__val, &__ret, __memorder, __s); - return __ret; -} - -static inline _CCCL_DEVICE void __atomic_signal_fence_cuda(int) -{ - asm volatile("" ::: "memory"); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h deleted file mode 100644 index 8d5d7967cb..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h +++ /dev/null @@ -1,17 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_GCC_H -#define _LIBCUDACXX_ATOMIC_GCC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_GCC_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h deleted file mode 100644 index 129b088081..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h +++ /dev/null @@ -1,17 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_NVRTC_H -#define _LIBCUDACXX_ATOMIC_NVRTC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_NVRTC_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h deleted file mode 100644 index 9a035b1e4d..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H -#define __LIBCUDACXX_ATOMIC_SCOPES_H - -// REMEMBER CHANGES TO THESE ARE ABI BREAKING -// TODO: Space values out for potential new scopes -#ifndef __ATOMIC_BLOCK -# define __ATOMIC_SYSTEM 0 // 0 indicates default -# define __ATOMIC_DEVICE 1 -# define __ATOMIC_BLOCK 2 -# define __ATOMIC_THREAD 10 -#endif //__ATOMIC_BLOCK - -enum thread_scope -{ - thread_scope_system = __ATOMIC_SYSTEM, - thread_scope_device = __ATOMIC_DEVICE, - thread_scope_block = __ATOMIC_BLOCK, - thread_scope_thread = __ATOMIC_THREAD -}; - -#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope -#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system - -struct __thread_scope_thread_tag -{}; -struct __thread_scope_block_tag -{}; -struct __thread_scope_device_tag -{}; -struct __thread_scope_system_tag -{}; - -template -struct __scope_enum_to_tag -{}; -/* This would be the implementation once an actual thread-scope backend exists. 
-template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_thread_tag; }; -Until then: */ -template <> -struct __scope_enum_to_tag<(int) thread_scope_thread> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_block> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_device> -{ - using type = __thread_scope_device_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_system> -{ - using type = __thread_scope_system_tag; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> typename __scope_enum_to_tag<_Scope>::type -{ - return typename __scope_enum_to_tag<_Scope>::type(); -} - -#endif // __LIBCUDACXX_ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h deleted file mode 100644 index a4212f44a7..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h +++ /dev/null @@ -1,180 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_CXX_ATOMIC_H -#define _LIBCUDACXX_CXX_ATOMIC_H - -template -struct __cxx_atomic_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = __cxx_atomic_base_impl<_Tp, _Sco>; - using __wrap_t = __cxx_atomic_base_impl<_Tp, _Sco>; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_base_impl() noexcept = default; - constexpr __cxx_atomic_base_impl(__cxx_atomic_base_impl&&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - - __cxx_atomic_base_impl& operator=(const __cxx_atomic_base_impl&) noexcept = default; - - _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile __cxx_atomic_base_impl<_Tp, _Sco>* 
-__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a; -} - -template -struct __cxx_atomic_ref_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = _Tp; - using __wrap_t = _Tp; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, - "std::atomic_ref requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_ref_base_impl() noexcept = delete; - constexpr __cxx_atomic_ref_base_impl(__cxx_atomic_ref_base_impl&&) noexcept = default; - constexpr __cxx_atomic_ref_base_impl(const __cxx_atomic_ref_base_impl&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_ref_base_impl(_Tp& value) noexcept - : __a_value(&value) - {} - - _Tp* __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(_Tp* __a) noexcept -{ - return __a; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_wrap_to_base(_Tp*, _Up __val) noexcept -> - typename _Tp::__wrap_t -{ - return typename _Tp::__wrap_t(__val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_base_temporary(_Tp*) noexcept -> typename _Tp::__temporary_t -{ - return typename _Tp::__temporary_t(); -} - -template -using __cxx_atomic_underlying_t = typename _Tp::__underlying_t; - -#endif //_LIBCUDACXX_CXX_ATOMIC_H diff --git a/libcudacxx/test/libcudacxx/cuda/annotated_ptr/utils.h b/libcudacxx/test/libcudacxx/cuda/annotated_ptr/utils.h index 5eddfd442d..588bbedb4f 100644 --- 
a/libcudacxx/test/libcudacxx/cuda/annotated_ptr/utils.h +++ b/libcudacxx/test/libcudacxx/cuda/annotated_ptr/utils.h @@ -14,6 +14,7 @@ #endif #include +#include #if defined(DEBUG) # define DPRINTF(...) \ diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch.fail.cpp index e2d73258c9..2a855a6223 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch.fail.cpp @@ -9,9 +9,9 @@ // UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 // UNSUPPORTED: windows && pre-sm-70 -// +// -#include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_max.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_max.pass.cpp index 2c83f5d66e..3818fc3ab7 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_max.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_max.pass.cpp @@ -9,9 +9,9 @@ // UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 // UNSUPPORTED: windows && pre-sm-70 -// +// -#include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_min.pass.cpp b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_min.pass.cpp index 05920744c6..4a5c9dfef2 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_min.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_fetch_min.pass.cpp @@ -9,9 +9,9 @@ // UNSUPPORTED: libcpp-has-no-threads, pre-sm-60 // UNSUPPORTED: windows && pre-sm-70 -// +// -#include +#include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_helpers.h b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_helpers.h index ae3ac2ec5e..cc54eda725 100644 --- a/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_helpers.h +++ b/libcudacxx/test/libcudacxx/cuda/atomics/atomic.ext/atomic_helpers.h @@ -9,7 +9,7 @@ #ifndef ATOMIC_HELPERS_H #define ATOMIC_HELPERS_H -#include +#include #include #include "test_macros.h" diff --git a/libcudacxx/test/libcudacxx/cuda/bad_atomic_alignment.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bad_atomic_alignment.pass.cpp index d0566c3a14..e4a099ac6c 100644 --- a/libcudacxx/test/libcudacxx/cuda/bad_atomic_alignment.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/bad_atomic_alignment.pass.cpp @@ -37,8 +37,11 @@ struct TestFn A& t = *sel.construct(); cuda::std::atomic_init(&t, key{1, 2}); auto r = t.load(); + auto d = key{5, 5}; t.store(r); (void) t.exchange(r); + (void) t.compare_exchange_weak(r, d, cuda::memory_order_seq_cst, cuda::memory_order_seq_cst); + (void) t.compare_exchange_strong(d, r, cuda::memory_order_seq_cst, cuda::memory_order_seq_cst); } { struct alignas(8) key @@ -51,8 +54,11 @@ struct TestFn A& t = *sel.construct(); cuda::std::atomic_init(&t, key{1, 2}); auto r = t.load(); + auto d = key{5, 5}; t.store(r); (void) t.exchange(r); + (void) t.compare_exchange_weak(r, d, cuda::memory_order_seq_cst, cuda::memory_order_seq_cst); + (void) t.compare_exchange_strong(d, r, cuda::memory_order_seq_cst, cuda::memory_order_seq_cst); } } }; diff --git a/libcudacxx/test/libcudacxx/cuda/pipeline_group_concept.h b/libcudacxx/test/libcudacxx/cuda/pipeline_group_concept.h index 2410abea06..83d08371d5 100644 --- a/libcudacxx/test/libcudacxx/cuda/pipeline_group_concept.h +++ 
b/libcudacxx/test/libcudacxx/cuda/pipeline_group_concept.h @@ -13,6 +13,7 @@ // TODO: Remove pointless comparison suppression when compiler fixes short-circuiting #include +#include #include "test_macros.h" diff --git a/libcudacxx/test/libcudacxx/heterogeneous/helpers.h b/libcudacxx/test/libcudacxx/heterogeneous/helpers.h index 7691912558..3b6759d61c 100644 --- a/libcudacxx/test/libcudacxx/heterogeneous/helpers.h +++ b/libcudacxx/test/libcudacxx/heterogeneous/helpers.h @@ -81,10 +81,6 @@ __host__ inline std::vector& host_threads() __host__ inline void sync_host_threads() { -#ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); - fflush(stdout); -#endif for (auto&& thread : host_threads()) { thread.join(); @@ -100,10 +96,6 @@ __host__ inline std::vector& device_threads() __host__ inline void sync_device_threads() { -#ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); - fflush(stdout); -#endif for (auto&& thread : device_threads()) { thread.join(); @@ -217,14 +209,14 @@ template void device_initialize(T& object) { #ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); + printf(" %s\n", __PRETTY_FUNCTION__); fflush(stdout); #endif auto kernel_launcher = [&object](cudaStream_t stream) { constexpr auto tc = threadcount_trait::value; #ifdef DEBUG_TESTERS - printf("%i device init threads launched\r\n", (int) tc); + printf(" %i device init threads launched\r\n", (int) tc); fflush(stdout); #endif initialization_kernel<<<1, tc, 0, stream>>>(object); @@ -234,10 +226,6 @@ void device_initialize(T& object) if (!async_initialize_trait::value) { -#ifdef DEBUG_TESTERS - printf("init not async, synchronizing\r\n"); - fflush(stdout); -#endif HETEROGENEOUS_SAFE_CALL(cudaDeviceSynchronize()); sync_all(); } @@ -247,14 +235,14 @@ template void device_validate(T& object) { #ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); + printf(" %s\n", __PRETTY_FUNCTION__); fflush(stdout); #endif auto kernel_launcher = [&object](cudaStream_t stream) { constexpr auto tc = threadcount_trait::value; #ifdef DEBUG_TESTERS - printf("%i device validate threads launched\r\n", (int) tc); + printf(" %i device validate threads launched\r\n", (int) tc); fflush(stdout); #endif validation_kernel<<<1, tc, 0, stream>>>(object); @@ -264,10 +252,6 @@ void device_validate(T& object) if (!async_validate_trait::value) { -#ifdef DEBUG_TESTERS - printf("validate not async, synchronizing\r\n"); - fflush(stdout); -#endif HETEROGENEOUS_SAFE_CALL(cudaDeviceSynchronize()); sync_all(); } @@ -277,13 +261,13 @@ template void host_initialize(T& object) { #ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); + printf(" %s\n", __PRETTY_FUNCTION__); fflush(stdout); #endif constexpr auto tc = threadcount_trait::value; #ifdef DEBUG_TESTERS - printf("%i host init threads launched\r\n", (int) tc); + printf(" %i host init threads launched\r\n", (int) tc); fflush(stdout); #endif @@ -296,10 +280,6 @@ void host_initialize(T& object) if (!async_initialize_trait::value) { -#ifdef DEBUG_TESTERS - printf("init not async, synchronizing\r\n"); - fflush(stdout); -#endif HETEROGENEOUS_SAFE_CALL(cudaDeviceSynchronize()); sync_all(); } @@ -309,13 +289,13 @@ template void host_validate(T& object) { #ifdef DEBUG_TESTERS - printf("%s\n", __PRETTY_FUNCTION__); + printf(" %s\n", __PRETTY_FUNCTION__); fflush(stdout); #endif constexpr auto tc = threadcount_trait::value; #ifdef DEBUG_TESTERS - printf("%i host validate threads launched\r\n", (int) tc); + printf(" %i host validate threads launched\r\n", (int) tc); fflush(stdout); #endif @@ -328,10 
+308,6 @@ void host_validate(T& object) if (!async_initialize_trait::value) { -#ifdef DEBUG_TESTERS - printf("validate not async, synchronizing\r\n"); - fflush(stdout); -#endif HETEROGENEOUS_SAFE_CALL(cudaDeviceSynchronize()); sync_all(); } @@ -396,7 +372,7 @@ template testers, Args... args) // ex: type_list using initial_launcher_list = append_n>, host_launcher>; +#ifdef DEBUG_TESTERS + printf("Launching %zd permutations\r\n", sizeof...(Testers)); + fflush(stdout); +#endif permute_tests(test_harness, initial_launcher_list{}); } @@ -652,10 +632,18 @@ void validate_pinned(Args... args) { using list_t = typename validate_list::type; list_t list0; +#ifdef DEBUG_TESTERS + printf("%s\n", "Launching permuted H/D tests"); + fflush(stdout); +#endif validate_device_dynamic(list0, args...); if (check_managed_memory_support(is_tester_list_async::value)) { +#ifdef DEBUG_TESTERS + printf("%s\n", "Launching mixed H/D tests"); + fflush(stdout); +#endif typename validate_list::type list1; validate_managed(list1, args...); } diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.flag/init.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.flag/init.pass.cpp index 72090475a4..9bf8624f67 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.flag/init.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.flag/init.pass.cpp @@ -14,7 +14,7 @@ // struct atomic_flag -// atomic_flag() = ATOMIC_FLAG_INIT; +// atomic_flag() = LIBCUDACXX_ATOMIC_FLAG_INIT; #include #include @@ -24,9 +24,9 @@ int main(int, char**) { NV_DISPATCH_TARGET(NV_IS_HOST, - (cuda::std::atomic_flag f = ATOMIC_FLAG_INIT; assert(f.test_and_set() == 0);), + (cuda::std::atomic_flag f = LIBCUDACXX_ATOMIC_FLAG_INIT; assert(f.test_and_set() == 0);), NV_PROVIDES_SM_70, - (cuda::std::atomic_flag f = ATOMIC_FLAG_INIT; assert(f.test_and_set() == 0);)) + (cuda::std::atomic_flag f = LIBCUDACXX_ATOMIC_FLAG_INIT; assert(f.test_and_set() == 0);)) return 0; } diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp index 9e5e9d41e7..7ec8db0973 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/isalwayslockfree.pass.cpp @@ -38,7 +38,7 @@ __host__ __device__ void checkAlwaysLockFree() } // FIXME: This separate test is needed to work around llvm.org/PR31864 -// which causes ATOMIC_LLONG_LOCK_FREE to be defined as '1' in 32-bit builds +// which causes LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE to be defined as '1' in 32-bit builds // even though __atomic_always_lock_free returns true for the same type. 
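The hunk above renames ATOMIC_LLONG_LOCK_FREE to LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE while keeping the invariant that the macro and the is_always_lock_free trait agree. Before the workaround constant that follows, here is a minimal, hedged sketch of that invariant outside the test harness; the include and the helper name check_llong_macro_consistency are illustrative assumptions, not part of the patch.

#include <cuda/std/atomic>

// Hypothetical helper mirroring checkLongLongTypes() from this test:
// a macro value of 2 means "always lock-free", so it must match the trait.
__host__ __device__ void check_llong_macro_consistency()
{
  static_assert(cuda::std::atomic<long long>::is_always_lock_free
                  == (2 == LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE), "");
  static_assert(cuda::std::atomic<unsigned long long>::is_always_lock_free
                  == (2 == LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE), "");
}
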
constexpr bool NeedWorkaroundForPR31864 = #if defined(__clang__) @@ -53,8 +53,8 @@ template __host__ __device__ void checkLongLongTypes() { - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_LLONG_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE), ""); } // Used to make the calls to __atomic_always_lock_free dependent on a template @@ -74,7 +74,7 @@ __host__ __device__ void checkLongLongTypes() constexpr bool ExpectLockFree = __atomic_always_lock_free(getSizeOf(), 0); static_assert(cuda::std::atomic::is_always_lock_free == ExpectLockFree, ""); static_assert(cuda::std::atomic::is_always_lock_free == ExpectLockFree, ""); - static_assert((0 != ATOMIC_LLONG_LOCK_FREE) == ExpectLockFree, ""); + static_assert((0 != LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE) == ExpectLockFree, ""); } __host__ __device__ void run() @@ -143,22 +143,23 @@ __host__ __device__ void run() }); // C macro and static constexpr must be consistent. - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_BOOL_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_CHAR_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_CHAR16_T_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_CHAR32_T_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_WCHAR_T_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_SHORT_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_INT_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_LONG_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_INT_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == 
LIBCUDACXX_ATOMIC_INT_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_LONG_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_LONG_LOCK_FREE), ""); checkLongLongTypes(); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), ""); - static_assert(cuda::std::atomic::is_always_lock_free == (2 == ATOMIC_POINTER_LOCK_FREE), ""); + static_assert(cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE), ""); + static_assert( + cuda::std::atomic::is_always_lock_free == (2 == LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE), ""); } int main(int, char**) diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/lockfree.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/lockfree.pass.cpp index 1ca3afd2f7..f0853813ad 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/lockfree.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.lockfree/lockfree.pass.cpp @@ -29,16 +29,26 @@ int main(int, char**) { - assert(ATOMIC_BOOL_LOCK_FREE == 0 || ATOMIC_BOOL_LOCK_FREE == 1 || ATOMIC_BOOL_LOCK_FREE == 2); - assert(ATOMIC_CHAR_LOCK_FREE == 0 || ATOMIC_CHAR_LOCK_FREE == 1 || ATOMIC_CHAR_LOCK_FREE == 2); - assert(ATOMIC_CHAR16_T_LOCK_FREE == 0 || ATOMIC_CHAR16_T_LOCK_FREE == 1 || ATOMIC_CHAR16_T_LOCK_FREE == 2); - assert(ATOMIC_CHAR32_T_LOCK_FREE == 0 || ATOMIC_CHAR32_T_LOCK_FREE == 1 || ATOMIC_CHAR32_T_LOCK_FREE == 2); - assert(ATOMIC_WCHAR_T_LOCK_FREE == 0 || ATOMIC_WCHAR_T_LOCK_FREE == 1 || ATOMIC_WCHAR_T_LOCK_FREE == 2); - assert(ATOMIC_SHORT_LOCK_FREE == 0 || ATOMIC_SHORT_LOCK_FREE == 1 || ATOMIC_SHORT_LOCK_FREE == 2); - assert(ATOMIC_INT_LOCK_FREE == 0 || ATOMIC_INT_LOCK_FREE == 1 || ATOMIC_INT_LOCK_FREE == 2); - assert(ATOMIC_LONG_LOCK_FREE == 0 || ATOMIC_LONG_LOCK_FREE == 1 || ATOMIC_LONG_LOCK_FREE == 2); - assert(ATOMIC_LLONG_LOCK_FREE == 0 || ATOMIC_LLONG_LOCK_FREE == 1 || ATOMIC_LLONG_LOCK_FREE == 2); - assert(ATOMIC_POINTER_LOCK_FREE == 0 || ATOMIC_POINTER_LOCK_FREE == 1 || ATOMIC_POINTER_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_CHAR_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_CHAR16_T_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_CHAR32_T_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_WCHAR_T_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_SHORT_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_INT_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_INT_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_INT_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_LONG_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_LONG_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_LONG_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_LLONG_LOCK_FREE == 2); + assert(LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE == 0 || LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE == 1 + || LIBCUDACXX_ATOMIC_POINTER_LOCK_FREE == 2); return 0; } diff --git 
a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address.pass.cpp index 37bfc73300..74dc6f8515 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address.pass.cpp @@ -68,6 +68,7 @@ // T* operator-=(ptrdiff_t op); // }; +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref.pass.cpp index 0cae7e53a6..376ca94e19 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref.pass.cpp @@ -68,6 +68,7 @@ // T* operator-=(ptrdiff_t op); // }; +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref_constness.pass.cpp index 9108280b80..9adc1d390b 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/address_ref_constness.pass.cpp @@ -68,6 +68,7 @@ // T* operator-=(ptrdiff_t op); // }; +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/atomic_copyable.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/atomic_copyable.pass.cpp index 3650b84f07..a9486a5dcd 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/atomic_copyable.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/atomic_copyable.pass.cpp @@ -15,6 +15,7 @@ // +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/bool.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/bool.pass.cpp index 6dc016dabf..131d3677d1 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/bool.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/bool.pass.cpp @@ -51,6 +51,7 @@ // // typedef atomic atomic_bool; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/cstdint_typedefs.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/cstdint_typedefs.pass.cpp index 6105a54918..13b1afe169 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/cstdint_typedefs.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/cstdint_typedefs.pass.cpp @@ -36,6 +36,7 @@ // typedef atomic atomic_intmax_t; // typedef atomic atomic_uintmax_t; +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/enum_class.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/enum_class.pass.cpp index 1904c53206..adc43d32a5 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/enum_class.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/enum_class.pass.cpp @@ -49,6 +49,7 @@ // T operator=(T) noexcept; // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point.pass.cpp 
b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point.pass.cpp index f000d0e69a..28145c99bf 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point.pass.cpp @@ -72,6 +72,7 @@ // floating_point operator-=(floating_point op); // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref.pass.cpp index c790be5b6a..ce25bc45d3 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref.pass.cpp @@ -72,6 +72,7 @@ // floating_point operator-=(floating_point op); // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref_constness.pass.cpp index 6ff9981471..7c5dae71a9 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/floating_point_ref_constness.pass.cpp @@ -12,6 +12,7 @@ // +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral.pass.cpp index 272cedff26..ed53c53c57 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral.pass.cpp @@ -86,6 +86,7 @@ // integral operator^=(integral op); // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp index b685255e02..56153f3664 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref.pass.cpp @@ -86,6 +86,7 @@ // integral operator^=(integral op); // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp index 2b20eb7841..b237c862a5 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.generic/integral_ref_constness.pass.cpp @@ -86,6 +86,7 @@ // integral operator^=(integral op); // }; +#include #include #include diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_var_init.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_var_init.pass.cpp index a3acff9845..d81e4d11e9 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_var_init.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_var_init.pass.cpp @@ -12,7 +12,7 @@ // -// #define ATOMIC_VAR_INIT(value) +// #define LIBCUDACXX_ATOMIC_VAR_INIT(value) 
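The comment above names the macro under test in atomic_var_init.pass.cpp, now spelled LIBCUDACXX_ATOMIC_VAR_INIT, and the init.pass.cpp hunk earlier makes the matching change to LIBCUDACXX_ATOMIC_FLAG_INIT. A short usage sketch of both renamed initializers follows, assuming <cuda/std/atomic> provides them as this patch intends; the struct node, the int value type, and demo() are illustrative only.

#include <cuda/std/atomic>

// Usage sketch of the renamed initializer macros from this patch.
// 'node', 'demo', and the int value type are assumptions for illustration.
struct node
{
  cuda::std::atomic<int> count = LIBCUDACXX_ATOMIC_VAR_INIT(0);
  cuda::std::atomic_flag taken = LIBCUDACXX_ATOMIC_FLAG_INIT;
};

__host__ __device__ void demo()
{
  cuda::std::atomic<int> v = LIBCUDACXX_ATOMIC_VAR_INIT(5);
  bool ok = (v.load() == 5); // the test below asserts the same condition
  (void) ok;
}
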
#include #include @@ -22,7 +22,7 @@ int main(int, char**) { - cuda::std::atomic v = ATOMIC_VAR_INIT(5); + cuda::std::atomic v = LIBCUDACXX_ATOMIC_VAR_INIT(5); assert(v == 5); return 0; diff --git a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/ctor.pass.cpp b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/ctor.pass.cpp index 050bb36e72..b033b1ff83 100644 --- a/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/ctor.pass.cpp +++ b/libcudacxx/test/libcudacxx/std/atomics/atomics.types.operations/atomics.types.operations.req/ctor.pass.cpp @@ -20,6 +20,7 @@ #define _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS +#include #include #include #include @@ -62,7 +63,7 @@ struct TestFunc #if !defined(_GNUC_VER) || _GNUC_VER >= 409 // TODO: Figure out why this is failing with GCC 4.8.2 on CentOS 7 only. { - constexpr Atomic a = ATOMIC_VAR_INIT(t); + constexpr Atomic a = LIBCUDACXX_ATOMIC_VAR_INIT(t); assert(a == t); } #endif diff --git a/libcudacxx/test/utils/libcudacxx/test/format.py b/libcudacxx/test/utils/libcudacxx/test/format.py index f2b6f478fb..3a58447989 100644 --- a/libcudacxx/test/utils/libcudacxx/test/format.py +++ b/libcudacxx/test/utils/libcudacxx/test/format.py @@ -74,6 +74,10 @@ def getTestsInDirectory(self, testSuite, path_in_suite, yield lit.Test.Test(testSuite, path_in_suite + (filename,), localConfig) + def getTestsForPath(self, testSuite, path_in_suite, + litConfig, localConfig): + yield lit.Test.Test(testSuite, path_in_suite, localConfig) + def execute(self, test, lit_config): while True: try:
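Stepping back from the lit format.py change above: earlier in this patch, bad_atomic_alignment.pass.cpp gains compare_exchange_weak/strong calls on over-aligned aggregates. A hedged sketch of that scenario follows; the key layout, the function name, and the use of cuda::std::memory_order_seq_cst (the test itself spells the orders as cuda::memory_order_seq_cst) are assumptions for illustration, not the test code.

#include <cuda/atomic>

// Sketch only: an aggregate whose alignment exceeds its data, so the atomic
// backend must tolerate padding bits in the new compare-exchange coverage.
struct alignas(8) key
{
  int a; // 4 bytes of payload inside an 8-byte-aligned, 8-byte-sized object
};

__host__ __device__ void exercise_padded_cas()
{
  cuda::std::atomic<key> t;
  cuda::std::atomic_init(&t, key{1});
  key expected = t.load();
  key desired{5};
  (void) t.compare_exchange_weak(expected, desired,
                                 cuda::std::memory_order_seq_cst,
                                 cuda::std::memory_order_seq_cst);
  (void) t.compare_exchange_strong(expected, desired,
                                   cuda::std::memory_order_seq_cst,
                                   cuda::std::memory_order_seq_cst);
}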