From 32a02275a987045ed61b17252e8e647787639538 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:45:22 -0700 Subject: [PATCH 01/71] Delete header --- libcudacxx/include/cuda/std/atomic | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 libcudacxx/include/cuda/std/atomic diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic deleted file mode 100644 index 0daab5f2cb..0000000000 --- a/libcudacxx/include/cuda/std/atomic +++ /dev/null @@ -1,22 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA_STD_ATOMIC -#define _CUDA_STD_ATOMIC - -#include - -#include - -#include - -#include - -#endif // _CUDA_STD_ATOMIC From 4be887d1898fc660a41fdf7e4e02bc4d9c6679a9 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:46:32 -0700 Subject: [PATCH 02/71] Move atomic from libcxx to top-level --- libcudacxx/include/cuda/std/{detail/libcxx/include => }/atomic | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include => }/atomic (100%) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/atomic similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/atomic rename to libcudacxx/include/cuda/std/atomic From b36fec61334a75e4e72db0ac6e8f0053e3209b13 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:50:52 -0700 Subject: [PATCH 03/71] Move PTX backends from libcxx to --- libcudacxx/codegen/CMakeLists.txt | 10 +++++----- libcudacxx/codegen/codegen.cpp | 4 ++-- .../atomic_cuda_ptx_derived.h} | 0 .../atomic_cuda_ptx_generated.h} | 0 4 files changed, 7 insertions(+), 7 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_cuda_derived.h => __atomic/atomic_cuda_ptx_derived.h} (100%) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_cuda_generated.h => __atomic/atomic_cuda_ptx_generated.h} (100%) diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index b0df1b5a98..af1b6bdb8a 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -19,8 +19,8 @@ target_compile_features( add_dependencies(libcudacxx.atomics.codegen codegen) -set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_generated.h") -set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/detail/libcxx/include/support/atomic") +set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_ptx_generated.h") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic") add_custom_target( libcudacxx.atomics.codegen.execute @@ -32,13 +32,13 @@ add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute) add_custom_target( libcudacxx.atomics.codegen.install - COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_generated.h" - BYPRODUCTS "${atomic_install_location}/atomic_cuda_generated.h" + COMMAND 
${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_ptx_generated.h" + BYPRODUCTS "${atomic_install_location}/atomic_cuda_ptx_generated.h" ) add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute) add_test( NAME libcudacxx.atomics.codegen.diff - COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_generated.h" "${atomic_generated_output}" + COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_ptx_generated.h" "${atomic_generated_output}" ) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index 77d96a92d9..fd032d1d4b 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -66,9 +66,9 @@ int main() std::vector cv_qualifier{"volatile ", ""}; - std::ofstream out("atomic_cuda_generated.h"); + std::ofstream out("atomic_cuda_ptx_generated.h"); - out << R"XXX(//===----------------------------------------------------------------------===// + out << R"XXX(//===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h b/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_derived.h rename to libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h b/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h similarity index 100% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h rename to libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h From 52a60bb7f73040dc7b6ff38ad4857c1a4e6f3ce8 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 11 Apr 2024 12:53:09 -0700 Subject: [PATCH 04/71] Delete remaining atomics backends. 
Move MSVC backend --- .../atomic_gcc.h => __atomic/platform.h} | 9 +- .../platform/msvc_to_builtins.h} | 2 - .../include/support/atomic/atomic_base.h | 246 ------ .../include/support/atomic/atomic_c11.h | 241 ------ .../include/support/atomic/atomic_cuda.h | 787 ------------------ .../include/support/atomic/atomic_nvrtc.h | 17 - .../include/support/atomic/atomic_scopes.h | 67 -- .../include/support/atomic/cxx_atomic.h | 180 ---- 8 files changed, 3 insertions(+), 1546 deletions(-) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_gcc.h => __atomic/platform.h} (74%) rename libcudacxx/include/cuda/std/{detail/libcxx/include/support/atomic/atomic_msvc.h => __atomic/platform/msvc_to_builtins.h} (99%) delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h delete mode 100644 libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h b/libcudacxx/include/cuda/std/__atomic/platform.h similarity index 74% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h rename to libcudacxx/include/cuda/std/__atomic/platform.h index 8d5d7967cb..9a2f683d15 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_gcc.h +++ b/libcudacxx/include/cuda/std/__atomic/platform.h @@ -9,9 +9,6 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCUDACXX_ATOMIC_GCC_H -#define _LIBCUDACXX_ATOMIC_GCC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_GCC_H +#if defined(_CCCL_COMPILER_MSVC) +#include +#endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h similarity index 99% rename from libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h rename to libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index 53cd9cd4d7..d48c68acb4 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_msvc.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -621,5 +621,3 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) } return __expected; } - -#include diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h deleted file mode 100644 index 65be5cfd97..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_base.h +++ /dev/null @@ -1,246 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_BASE_H -#define _LIBCUDACXX_ATOMIC_BASE_H - -#include - -// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELEASE - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME)))); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline constexpr int __cxx_atomic_failure_order_to_int(memory_order __order) -{ - // Avoid switch statement to make this a constexpr. - return __order == memory_order_relaxed - ? __ATOMIC_RELAXED - : (__order == memory_order_acquire - ? __ATOMIC_ACQUIRE - : (__order == memory_order_release - ? __ATOMIC_RELAXED - : (__order == memory_order_seq_cst - ? __ATOMIC_SEQ_CST - : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME)))); -} - -template -inline void __cxx_atomic_init(volatile _Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - __cxx_atomic_assign_volatile(*__a_tmp, __val); -} - -template -inline void __cxx_atomic_init(_Tp* __a, _Up __val) -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - *__a_tmp = __val; -} - -inline void __cxx_atomic_thread_fence(memory_order __order) -{ - __atomic_thread_fence(__cxx_atomic_order_to_int(__order)); -} - -inline void __cxx_atomic_signal_fence(memory_order __order) -{ - __atomic_signal_fence(__cxx_atomic_order_to_int(__order)); -} - -template -inline void __cxx_atomic_store(_Tp* __a, _Up __val, memory_order __order) -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - __atomic_store(__cxx_atomic_unwrap(__a), &__v_temp, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_load(const _Tp* __a, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_load(__cxx_atomic_unwrap(__a), &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline auto __cxx_atomic_exchange(_Tp* __a, _Up __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __v_temp = __cxx_atomic_wrap_to_base(__a, __val); - auto __ret = __cxx_atomic_base_temporary(__a); - __atomic_exchange(__cxx_atomic_unwrap(__a), &__v_temp, &__ret, __cxx_atomic_order_to_int(__order)); - return *__cxx_get_underlying_atomic(&__ret); -} - -template -inline bool __cxx_atomic_compare_exchange_strong( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - false, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -inline bool __cxx_atomic_compare_exchange_weak( - _Tp* __a, _Up* __expected, _Up __value, memory_order __success, memory_order __failure) -{ - (void) __expected; - return __atomic_compare_exchange( - 
__cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)), - __expected, - &__value, - true, - __cxx_atomic_order_to_int(__success), - __cxx_atomic_failure_order_to_int(__failure)); -} - -template -struct __atomic_ptr_inc -{ - enum - { - value = 1 - }; -}; - -template -struct __atomic_ptr_inc<_Tp*> -{ - enum - { - value = sizeof(_Tp) - }; -}; - -// FIXME: Haven't figured out what the spec says about using arrays with -// atomic_fetch_add. Force a failure rather than creating bad behavior. -template -struct __atomic_ptr_inc<_Tp[]> -{}; -template -struct __atomic_ptr_inc<_Tp[n]> -{}; - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_add(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_add(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected + __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected + __delta; - } - - return __expected; -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - constexpr auto __skip_v = __atomic_ptr_inc<__cxx_atomic_underlying_t<_Tp>>::value; - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_sub(__a_tmp, __delta * __skip_v, __cxx_atomic_order_to_int(__order)); -} - -template >::value, int> = 0> -inline auto __cxx_atomic_fetch_sub(_Tp* __a, _Td __delta, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected - __delta; - - while (!__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected - __delta; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_and(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_and(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_or(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_or(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_xor(_Tp* __a, _Td __pattern, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __a_tmp = __cxx_get_underlying_atomic(__cxx_atomic_unwrap(__a)); - return __atomic_fetch_xor(__a_tmp, __pattern, __cxx_atomic_order_to_int(__order)); -} - -template -inline auto __cxx_atomic_fetch_max(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected > __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected > __val ? 
__expected : __val; - } - - return __expected; -} - -template -inline auto __cxx_atomic_fetch_min(_Tp* __a, _Td __val, memory_order __order) -> __cxx_atomic_underlying_t<_Tp> -{ - auto __expected = __cxx_atomic_load(__a, memory_order_relaxed); - auto __desired = __expected < __val ? __expected : __val; - - while (__desired == __val && !__cxx_atomic_compare_exchange_strong(__a, &__expected, __desired, __order, __order)) - { - __desired = __expected < __val ? __expected : __val; - } - - return __expected; -} - -#endif // _LIBCUDACXX_ATOMIC_BASE_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h deleted file mode 100644 index 1e5c55d243..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_c11.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -// Atomics for C11 - -template -struct __cxx_atomic_base_impl -{ - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_base_impl() noexcept = default; - - constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - _LIBCUDACXX_DISABLE_EXTENSION_WARNING _Atomic(_Tp) __a_value; -}; - -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __c11_atomic_is_lock_free(__x, 0) -#endif - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_thread_fence(memory_order __order) noexcept -{ - __c11_atomic_thread_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -_LIBCUDACXX_INLINE_VISIBILITY inline void __cxx_atomic_signal_fence(memory_order __order) noexcept -{ - __c11_atomic_signal_fence(static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val) noexcept -{ - __c11_atomic_init(&__a->__a_value, __val); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void -__cxx_atomic_store(__cxx_atomic_base_impl<_Tp>* __a, _Tp __val, memory_order __order) noexcept -{ - __c11_atomic_store(&__a->__a_value, __val, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const volatile* __a, memory_order __order) noexcept -{ - using __ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_load(__cxx_atomic_base_impl<_Tp> const* __a, memory_order __order) noexcept -{ - using 
__ptr_type = typename remove_const__a_value)>::type*; - return __c11_atomic_load(const_cast<__ptr_type>(&__a->__a_value), static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_exchange(__cxx_atomic_base_impl<_Tp>* __a, _Tp __value, memory_order __order) noexcept -{ - return __c11_atomic_exchange(&__a->__a_value, __value, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_strong( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_impl<_Tp>* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) noexcept -{ - return __c11_atomic_compare_exchange_weak( - &__a->__a_value, - __expected, - __value, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_add(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp 
-__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Tp __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*> volatile* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp* -__cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order __order) noexcept -{ - return __c11_atomic_fetch_sub(&__a->__a_value, __delta, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_and(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_or(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp> volatile* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} -template -_LIBCUDACXX_INLINE_VISIBILITY _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_order __order) noexcept -{ - return __c11_atomic_fetch_xor(&__a->__a_value, __pattern, static_cast<__memory_order_underlying_t>(__order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h deleted file mode 100644 index b6fa9a16fd..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda.h +++ /dev/null @@ -1,787 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
-// -//===----------------------------------------------------------------------===// - -#if defined(__CUDA_MINIMUM_ARCH__) \ - && ((!defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 600) \ - || (defined(_CCCL_COMPILER_MSVC) && __CUDA_MINIMUM_ARCH__ < 700)) -# error "CUDA atomics are only supported for sm_60 and up on *nix and sm_70 and up on Windows." -#endif - -inline _CCCL_HOST_DEVICE int __stronger_order_cuda(int __a, int __b) -{ - int const __max = __a > __b ? __a : __b; - if (__max != __ATOMIC_RELEASE) - { - return __max; - } - static int const __xform[] = {__ATOMIC_RELEASE, __ATOMIC_ACQ_REL, __ATOMIC_ACQ_REL, __ATOMIC_RELEASE}; - return __xform[__a < __b ? __a : __b]; -} - -// pre-define lock free query for heterogeneous compatibility -#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE -# define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) (__x <= 8) -#endif - -// Wrap host atomic implementations into a sub-namespace -namespace __host -{ -#if defined(_CCCL_COMPILER_MSVC) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C11_ATOMIC_IMP) -// TODO -// # include -#elif defined(_CCCL_COMPILER_NVRTC) -# include -#endif -} // namespace __host - -using __host::__cxx_atomic_underlying_t; - -#include -#include - -_CCCL_HOST_DEVICE inline void __cxx_atomic_thread_fence(memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag());), - NV_IS_HOST, - (__host::__cxx_atomic_thread_fence(__order);)) -} - -_CCCL_HOST_DEVICE inline void __cxx_atomic_signal_fence(memory_order __order) -{ - NV_DISPATCH_TARGET(NV_IS_DEVICE, - (__atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order));), - NV_IS_HOST, - (__host::__cxx_atomic_signal_fence(__order);)) -} - -template -struct __cxx_atomic_base_heterogeneous_impl -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_base_impl<_Tp, _Sco> __a_value; -}; - -template -struct __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true> -{ - __cxx_atomic_base_heterogeneous_impl() noexcept = default; - - static_assert(sizeof(_Tp) >= 4, "atomic_ref does not support 1 or 2 byte types"); - static_assert(sizeof(_Tp) <= 8, "atomic_ref does not support types larger than 8 bytes"); - - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_heterogeneous_impl(_Tp& __value) - : __a_value(__value) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __host::__cxx_atomic_ref_base_impl<_Tp, _Sco> __a_value; -}; - -template -_CCCL_HOST_DEVICE constexpr _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr volatile _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const _Tp* -__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -_CCCL_HOST_DEVICE constexpr const volatile _Tp* 
-__cxx_get_underlying_device_atomic(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(&__a->__a_value); -} - -template -using __cxx_atomic_small_to_32 = __conditional_t::value, int32_t, uint32_t>; - -// Arithmetic conversions to/from proxy types -template ::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - return static_cast<__cxx_atomic_small_to_32<_Tp>>(__val); -} - -template ::value, int> = 0> -constexpr _CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - return static_cast<_Tp>(__val); -} - -// Non-arithmetic conversion to/from proxy types -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline __cxx_atomic_small_to_32<_Tp> __cxx_small_to_32(_Tp __val) -{ - __cxx_atomic_small_to_32<_Tp> __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template ::value, int> = 0> -_CCCL_HOST_DEVICE inline _Tp __cxx_small_from_32(__cxx_atomic_small_to_32<_Tp> __val) -{ - _Tp __temp{}; - memcpy(&__temp, &__val, sizeof(_Tp)); - return __temp; -} - -template -struct __cxx_atomic_base_small_impl -{ - __cxx_atomic_base_small_impl() noexcept = default; - _CCCL_HOST_DEVICE constexpr explicit __cxx_atomic_base_small_impl(_Tp __value) - : __a_value(__cxx_small_to_32(__value)) - {} - - using __underlying_t = _Tp; - static constexpr int __sco = _Sco; - - __cxx_atomic_base_heterogeneous_impl<__cxx_atomic_small_to_32<_Tp>, _Sco, false> __a_value; -}; - -template -using __cxx_atomic_base_impl = - __conditional_t, - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco>>; - -template -using __cxx_atomic_ref_base_impl = __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, true>; - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void __cxx_atomic_init(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val) -{ - alignas(_Tp) auto __tmp = __val; - __cxx_atomic_assign_volatile(*__cxx_get_underlying_device_atomic(__a), __tmp); -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE void -__cxx_atomic_store(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (__atomic_store_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__host::__cxx_atomic_store(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - 
(return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_load(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> const volatile* __a, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_load_n_cuda(__cxx_get_underlying_device_atomic(__a), - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_load(&__a->__a_value, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_exchange(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_exchange( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __val, memory_order __order) -{ - alignas(_Tp) auto __tmp = __val; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_exchange_n_cuda(__cxx_get_underlying_device_atomic(__a), - __tmp, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_exchange(&__a->__a_value, __tmp, __order);)) -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - false, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_strong(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - 
static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, - _Tp* __expected, - _Tp __val, - memory_order __success, - memory_order __failure) -{ - alignas(_Tp) auto __tmp = *__expected; - bool __result = false; - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (alignas(_Tp) auto __tmp_v = __val; - __result = __atomic_compare_exchange_cuda( - __cxx_get_underlying_device_atomic(__a), - &__tmp, - &__tmp_v, - true, - static_cast<__memory_order_underlying_t>(__success), - static_cast<__memory_order_underlying_t>(__failure), - __scope_tag<_Sco>());), - NV_IS_HOST, - (__result = __host::__cxx_atomic_compare_exchange_weak(&__a->__a_value, &__tmp, __val, __success, __failure);)) - *__expected = __tmp; - return __result; -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_add( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_add_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_add(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* 
__a, _Tp __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref>* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp* __cxx_atomic_fetch_sub( - __cxx_atomic_base_heterogeneous_impl<_Tp*, _Sco, _Ref> volatile* __a, ptrdiff_t __delta, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_sub_cuda(__cxx_get_underlying_device_atomic(__a), - __delta, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_sub(&__a->__a_value, __delta, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_and( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_and_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_and(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_or( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_or_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_or(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return 
__host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_xor( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Tp __pattern, memory_order __order) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_xor_cuda(__cxx_get_underlying_device_atomic(__a), - __pattern, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - NV_IS_HOST, - (return __host::__cxx_atomic_fetch_xor(&__a->__a_value, __pattern, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_max( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_max_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_max(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref>* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE _Tp __cxx_atomic_fetch_min( - __cxx_atomic_base_heterogeneous_impl<_Tp, _Sco, _Ref> volatile* __a, _Delta __val, memory_order __order) -{ - NV_IF_TARGET( - NV_IS_DEVICE, - (return __atomic_fetch_min_cuda(__cxx_get_underlying_device_atomic(__a), - __val, - static_cast<__memory_order_underlying_t>(__order), - __scope_tag<_Sco>());), - (return __host::__cxx_atomic_fetch_min(&__a->__a_value, __val, __order);)) -} - -template -_CCCL_HOST_DEVICE inline void __cxx_atomic_init(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val) -{ - __cxx_atomic_init(&__a->__a_value, __cxx_small_to_32(__val)); -} - -template -_CCCL_HOST_DEVICE inline void -__cxx_atomic_store(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __val, memory_order __order) -{ - __cxx_atomic_store(&__a->__a_value, __cxx_small_to_32(__val), __order); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_load(__cxx_atomic_base_small_impl<_Tp, _Sco> const volatile* __a, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_load(&__a->__a_value, __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_exchange(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __value, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_exchange(&__a->__a_value, __cxx_small_to_32(__value), __order)); -} -_CCCL_HOST_DEVICE inline int __cuda_memcmp(void const* __lhs, void const* __rhs, size_t __count) -{ - NV_DISPATCH_TARGET( - NV_IS_DEVICE, - (auto __lhs_c = reinterpret_cast(__lhs); - auto __rhs_c = reinterpret_cast(__rhs); - while (__count--) { - auto const __lhs_v = *__lhs_c++; - auto const __rhs_v = *__rhs_c++; - if 
(__lhs_v < __rhs_v) - { - return -1; - } - if (__lhs_v > __rhs_v) - { - return 1; - } - } return 0;), - NV_IS_HOST, - (return memcmp(__lhs, __rhs, __count);)) -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto __temp = __cxx_small_to_32(*__expected); - auto const __ret = - __cxx_atomic_compare_exchange_weak(&__a->__a_value, &__temp, __cxx_small_to_32(__value), __success, __failure); - auto const __actual = __cxx_small_from_32<_Tp>(__temp); - constexpr auto __mask = static_cast((1u << (8 * sizeof(_Tp))) - 1); - if (!__ret) - { - if (0 == __cuda_memcmp(&__actual, __expected, sizeof(_Tp))) - { - __cxx_atomic_fetch_and(&__a->__a_value, __mask, memory_order_relaxed); - } - else - { - *__expected = __actual; - } - } - return __ret; -} - -template -_CCCL_HOST_DEVICE inline bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, - _Tp* __expected, - _Tp __value, - memory_order __success, - memory_order __failure) -{ - auto const __old = *__expected; - while (1) - { - if (__cxx_atomic_compare_exchange_weak(__a, __expected, __value, __success, __failure)) - { - return true; - } - if (0 != __cuda_memcmp(&__old, __expected, sizeof(_Tp))) - { - return false; - } - } -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_add(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_add(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_sub(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __delta, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_sub(&__a->__a_value, __cxx_small_to_32(__delta), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_and(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_and(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_or(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_or(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_xor(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Tp __pattern, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_xor(&__a->__a_value, __cxx_small_to_32(__pattern), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_max(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_max(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} - -template -_CCCL_HOST_DEVICE inline _Tp -__cxx_atomic_fetch_min(__cxx_atomic_base_small_impl<_Tp, _Sco> volatile* __a, _Delta __val, memory_order __order) -{ - return __cxx_small_from_32<_Tp>(__cxx_atomic_fetch_min(&__a->__a_value, __cxx_small_to_32(__val), __order)); -} diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h deleted file mode 100644 index 
129b088081..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_nvrtc.h +++ /dev/null @@ -1,17 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_ATOMIC_NVRTC_H -#define _LIBCUDACXX_ATOMIC_NVRTC_H - -#include - -#endif // _LIBCUDACXX_ATOMIC_NVRTC_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h deleted file mode 100644 index 9a035b1e4d..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/atomic_scopes.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H -#define __LIBCUDACXX_ATOMIC_SCOPES_H - -// REMEMBER CHANGES TO THESE ARE ABI BREAKING -// TODO: Space values out for potential new scopes -#ifndef __ATOMIC_BLOCK -# define __ATOMIC_SYSTEM 0 // 0 indicates default -# define __ATOMIC_DEVICE 1 -# define __ATOMIC_BLOCK 2 -# define __ATOMIC_THREAD 10 -#endif //__ATOMIC_BLOCK - -enum thread_scope -{ - thread_scope_system = __ATOMIC_SYSTEM, - thread_scope_device = __ATOMIC_DEVICE, - thread_scope_block = __ATOMIC_BLOCK, - thread_scope_thread = __ATOMIC_THREAD -}; - -#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope -#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system - -struct __thread_scope_thread_tag -{}; -struct __thread_scope_block_tag -{}; -struct __thread_scope_device_tag -{}; -struct __thread_scope_system_tag -{}; - -template -struct __scope_enum_to_tag -{}; -/* This would be the implementation once an actual thread-scope backend exists. -template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { - using type = __thread_scope_thread_tag; }; -Until then: */ -template <> -struct __scope_enum_to_tag<(int) thread_scope_thread> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_block> -{ - using type = __thread_scope_block_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_device> -{ - using type = __thread_scope_device_tag; -}; -template <> -struct __scope_enum_to_tag<(int) thread_scope_system> -{ - using type = __thread_scope_system_tag; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> typename __scope_enum_to_tag<_Scope>::type -{ - return typename __scope_enum_to_tag<_Scope>::type(); -} - -#endif // __LIBCUDACXX_ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h b/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h deleted file mode 100644 index a4212f44a7..0000000000 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/support/atomic/cxx_atomic.h +++ /dev/null @@ -1,180 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCUDACXX_CXX_ATOMIC_H -#define _LIBCUDACXX_CXX_ATOMIC_H - -template -struct __cxx_atomic_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = __cxx_atomic_base_impl<_Tp, _Sco>; - using __wrap_t = __cxx_atomic_base_impl<_Tp, _Sco>; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_base_impl() noexcept = default; - constexpr __cxx_atomic_base_impl(__cxx_atomic_base_impl&&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_base_impl(_Tp value) noexcept - : __a_value(value) - {} - - __cxx_atomic_base_impl& operator=(const __cxx_atomic_base_impl&) noexcept = default; - - _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return &__a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile __cxx_atomic_base_impl<_Tp, _Sco>* -__cxx_atomic_unwrap(__cxx_atomic_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a; -} - -template -struct __cxx_atomic_ref_base_impl -{ - using __underlying_t = _Tp; - using __temporary_t = _Tp; - using __wrap_t = _Tp; - - static constexpr int __sco = _Sco; - -#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) - static_assert(is_trivially_copyable<_Tp>::value, - "std::atomic_ref requires that 'Tp' be a trivially copyable type"); -#endif - - constexpr __cxx_atomic_ref_base_impl() noexcept = delete; - constexpr __cxx_atomic_ref_base_impl(__cxx_atomic_ref_base_impl&&) noexcept = default; - constexpr __cxx_atomic_ref_base_impl(const __cxx_atomic_ref_base_impl&) noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_ref_base_impl(_Tp& value) noexcept - : __a_value(&value) - {} - - _Tp* __a_value; -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY 
constexpr volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_get_underlying_atomic(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __a->__a_value; -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco>* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr const volatile _Tp* -__cxx_atomic_unwrap(__cxx_atomic_ref_base_impl<_Tp, _Sco> const volatile* __a) noexcept -{ - return __cxx_get_underlying_atomic(__a); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp* __cxx_get_underlying_atomic(_Tp* __a) noexcept -{ - return __a; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_wrap_to_base(_Tp*, _Up __val) noexcept -> - typename _Tp::__wrap_t -{ - return typename _Tp::__wrap_t(__val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY constexpr auto __cxx_atomic_base_temporary(_Tp*) noexcept -> typename _Tp::__temporary_t -{ - return typename _Tp::__temporary_t(); -} - -template -using __cxx_atomic_underlying_t = typename _Tp::__underlying_t; - -#endif //_LIBCUDACXX_CXX_ATOMIC_H From 76294d11026a771868d4b636b95deba7af2b4e19 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Fri, 12 Apr 2024 19:40:57 -0700 Subject: [PATCH 05/71] First pass at making atomic use new backends --- libcudacxx/codegen/CMakeLists.txt | 2 +- libcudacxx/codegen/codegen.cpp | 8 +- .../atomic_cuda_ptx_derived.h | 0 .../atomic_cuda_ptx_generated.h | 31 +- .../std/__atomic/operations/heterogeneous.h | 265 ++++++ .../cuda/std/__atomic/operations/host.h | 182 ++++ libcudacxx/include/cuda/std/__atomic/order.h | 126 +++ .../include/cuda/std/__atomic/platform.h | 14 - .../std/__atomic/platform/msvc_to_builtins.h | 12 + .../cuda/std/__atomic/platform/platform.h | 59 ++ libcudacxx/include/cuda/std/__atomic/scopes.h | 52 + .../include/cuda/std/__atomic/storage/base.h | 60 ++ .../cuda/std/__atomic/storage/common.h | 46 + .../cuda/std/__atomic/storage/locked.h | 204 ++++ .../cuda/std/__atomic/storage/reference.h | 48 + .../include/cuda/std/__atomic/storage/small.h | 177 ++++ .../cuda/std/__atomic/wait/notify_wait.h | 188 ++++ .../include/cuda/std/__atomic/wait/polling.h | 56 ++ libcudacxx/include/cuda/std/atomic | 901 +++--------------- 19 files changed, 1624 insertions(+), 807 deletions(-) rename libcudacxx/include/cuda/std/__atomic/{ => operations}/atomic_cuda_ptx_derived.h (100%) rename libcudacxx/include/cuda/std/__atomic/{ => operations}/atomic_cuda_ptx_generated.h (99%) create mode 100644 libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h create mode 100644 libcudacxx/include/cuda/std/__atomic/operations/host.h create mode 100644 libcudacxx/include/cuda/std/__atomic/order.h delete mode 100644 
libcudacxx/include/cuda/std/__atomic/platform.h create mode 100644 libcudacxx/include/cuda/std/__atomic/platform/platform.h create mode 100644 libcudacxx/include/cuda/std/__atomic/scopes.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/base.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/common.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/locked.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/reference.h create mode 100644 libcudacxx/include/cuda/std/__atomic/storage/small.h create mode 100644 libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h create mode 100644 libcudacxx/include/cuda/std/__atomic/wait/polling.h diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt index af1b6bdb8a..77e749b83b 100644 --- a/libcudacxx/codegen/CMakeLists.txt +++ b/libcudacxx/codegen/CMakeLists.txt @@ -20,7 +20,7 @@ target_compile_features( add_dependencies(libcudacxx.atomics.codegen codegen) set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_ptx_generated.h") -set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic") +set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/operations") add_custom_target( libcudacxx.atomics.codegen.execute diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index fd032d1d4b..c1f809bd4b 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -78,8 +78,11 @@ int main() // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate +// This is a autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + )XXX"; auto scopenametag = [&](auto scope) { @@ -306,7 +309,7 @@ int main() "__failure_memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n"; - out << " memcpy(&__tmp, __desired, " << sz / 8 << ");\n"; + out << " memcpy(&__tmp, &__desired, " << sz / 8 << ");\n"; out << " memcpy(&__old, __expected, " << sz / 8 << ");\n"; out << " __old_tmp = __old;\n"; out << " NV_DISPATCH_TARGET(\n"; @@ -503,6 +506,7 @@ int main() } } + out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n"; out << "\n// clang-format on\n"; return 0; diff --git a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_derived.h similarity index 100% rename from libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_derived.h rename to libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_derived.h diff --git a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h similarity index 99% rename from libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h rename to libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h index 648de27352..ff1bdcf1ff 100644 --- a/libcudacxx/include/cuda/std/__atomic/atomic_cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h @@ -8,8 +8,11 @@ // //===----------------------------------------------------------------------===// -// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate +// This is a 
autogenerated file, we want to ensure that it contains exactly the contents we want to generate // clang-format off + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + static inline _CCCL_DEVICE void __cuda_membar_block() { asm volatile("membar.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_acq_rel_block() { asm volatile("fence.acq_rel.cta;":::"memory"); } static inline _CCCL_DEVICE void __cuda_fence_sc_block() { asm volatile("fence.sc.cta;":::"memory"); } @@ -251,7 +254,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -285,7 +288,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1158,7 +1161,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -1192,7 +1195,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2428,7 +2431,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -2462,7 +2465,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3335,7 +3338,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, 
&__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -3369,7 +3372,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4605,7 +4608,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -4639,7 +4642,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 4); + memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5512,7 +5515,7 @@ template static inli template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -5546,7 +5549,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ template = 0> _CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; - memcpy(&__tmp, __desired, 8); + memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); __old_tmp = __old; NV_DISPATCH_TARGET( @@ -6542,4 +6545,6 @@ _CCCL_DEVICE _Type* __atomic_fetch_sub_cuda(_Type **__ptr, ptrdiff_t __val, int return __ret; } +_LIBCUDACXX_END_NAMESPACE_STD + // clang-format on diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h new file mode 100644 index 0000000000..86a142de08 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -0,0 +1,265 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX___ATOMIC_DISPATCH_H +#define __LIBCUDACXX___ATOMIC_DISPATCH_H + +#include + +#include +#include +#include + +#include +#include +#include + +// Dispatch directly calls PTX/Host backends for atomic objects. +// By default these objects support extracting the address contained with operator()() +// this provides some amount of syntactic sugar to avoid duplicating every function that requires `volatile`. +// `_Tp` is able to be volatile and will simply be instatiated into a new function. +// It is up to the underlying backends to implement the correct volatile behavior + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +_LIBCUDACXX_HOST_DEVICE +inline + void __atomic_thread_fence_dispatch(memory_order __order) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_thread_fence_cuda(static_cast<__memory_order_underlying_t>(__order), __thread_scope_system_tag()); + ), + NV_IS_HOST, ( + __atomic_thread_fence_host(__order); + ) + ) +} + +_LIBCUDACXX_HOST_DEVICE +inline + void __atomic_signal_fence_dispatch(memory_order __order) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_signal_fence_cuda(static_cast<__memory_order_underlying_t>(__order)); + ), + NV_IS_HOST, ( + __atomic_signal_fence_host(__order); + ) + ) +} + +// Regarding __atomic_base_Tag +// It *is* possible to define it as: +// _Tag = __atomic_enable_if_default_base_t<_Tp> and make all tag types default to the 'base' backend +// I don't know if it's necessary to do that though. For now, this just adds some kind of protection +// preventing access to the functions with the wrong tag type. +template +using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; + +template > +_LIBCUDACXX_HOST_DEVICE + void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { + __atomic_assign_volatile(__a(), __val); +} + +template > +_LIBCUDACXX_HOST_DEVICE + void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + alignas(_Tp) auto __tmp = __val; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __atomic_store_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + __atomic_store_host(__a(), __tmp, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_load_n_cuda(__a(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_load_host(__a(), __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco = {}, _Tag = {}) { + alignas(_Tp) auto __tmp = __value; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_exchange_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_exchange_host(__a(), __tmp, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = 
__atomic_compare_exchange_cuda(__a(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + ), + NV_IS_HOST, ( + __result = __atomic_compare_exchange_strong_host(__a(), __expected, __val, __success, __failure); + ) + ) + return __result; +} + +template > +_LIBCUDACXX_HOST_DEVICE + bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { + bool __result = false; + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + ), + NV_IS_HOST, ( + __result = __atomic_compare_exchange_weak_host(__a(), __expected, __val, __success, __failure); + ) + ) + return __result; +} + +template +using __atomic_enable_if_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; +template +using __atomic_enable_if_not_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_add_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_add_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_sub_cuda(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_sub_host(__a(), __delta, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_and_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_and_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return 
__atomic_fetch_or_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_or_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_xor_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), + NV_IS_HOST, ( + return __atomic_fetch_xor_host(__a(), __pattern, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + NV_IF_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_max_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), ( + return __atomic_fetch_max_host(__a(), __val, __order); + ) + ) +} + +template > +_LIBCUDACXX_HOST_DEVICE + __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { + NV_IF_TARGET( + NV_IS_DEVICE, ( + return __atomic_fetch_min_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + ), ( + return __atomic_fetch_min_host(__a(), __val, __order); + ) + ) +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_DISPATCH_H diff --git a/libcudacxx/include/cuda/std/__atomic/operations/host.h b/libcudacxx/include/cuda/std/__atomic/operations/host.h new file mode 100644 index 0000000000..4870c011c4 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/operations/host.h @@ -0,0 +1,182 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMICS_HOST_H +#define _LIBCUDACXX___ATOMICS_HOST_H + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Guard ifdef for lock free query in case it is assigned elsewhere (MSVC/CUDA) +#ifndef _LIBCUDACXX_ATOMIC_IS_LOCK_FREE +#define _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(__x) __atomic_is_lock_free(__x, 0) +#endif + +inline +void __atomic_thread_fence_host(memory_order __order) { + __atomic_thread_fence(__atomic_order_to_int(__order)); +} + +inline +void __atomic_signal_fence_host(memory_order __order) { + __atomic_signal_fence(__atomic_order_to_int(__order)); +} + +template +inline void __atomic_store_host(_Tp* __a, _Up __val, memory_order __order) { + __atomic_store(__a, &__val, __atomic_order_to_int(__order)); +} + +template +inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> _Tp { + __remove_cvref_t<_Tp> __ret{}; + __atomic_load(__a, &__ret, __atomic_order_to_int(__order)); + return __ret; +} + +template +inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> _Tp { + __remove_cvref_t<_Tp> __ret{}; + __atomic_exchange(__a, &__val, &__ret, __atomic_order_to_int(__order)); + return __ret; +} + +template +inline bool __atomic_compare_exchange_strong_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + (void)__expected; + return __atomic_compare_exchange(__a, + __expected, &__value, false, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +inline bool __atomic_compare_exchange_weak_host( + _Tp* __a, _Up* __expected, _Up __value, memory_order __success, + memory_order __failure) { + (void)__expected; + return __atomic_compare_exchange(__a, + __expected, &__value, true, + __atomic_order_to_int(__success), + __atomic_failure_order_to_int(__failure)); +} + +template +struct __atomic_ptr_inc { enum {value = 1}; }; + +template +struct __atomic_ptr_inc<_Tp*> { enum {value = sizeof(_Tp)}; }; + +// FIXME: Haven't figured out what the spec says about using arrays with +// atomic_fetch_add. Force a failure rather than creating bad behavior. 
+template +struct __atomic_ptr_inc<_Tp[]> { }; +template +struct __atomic_ptr_inc<_Tp[n]> { }; + +template ::value, int> = 0> +inline _Tp __atomic_fetch_add_host(_Tp* __a, _Td __delta, + memory_order __order) { + constexpr auto __skip_v = __atomic_ptr_inc<_Tp>::value; + return __atomic_fetch_add(__a, __delta * __skip_v, + __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_add_host(_Tp* __a, _Td __delta, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected + __delta; + + while(!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected + __delta; + } + + return __expected; +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_sub_host(_Tp* __a, _Td __delta, + memory_order __order) { + constexpr auto __skip_v = __atomic_ptr_inc<_Tp>::value; + return __atomic_fetch_sub(__a, __delta * __skip_v, + __atomic_order_to_int(__order)); +} + +template ::value, int> = 0> +inline _Tp __atomic_fetch_sub_host(_Tp* __a, _Td __delta, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected - __delta; + + while(!__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected - __delta; + } + + return __expected; +} + +template +inline _Tp __atomic_fetch_and_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_and(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_or_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_or(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_xor_host(_Tp* __a, _Td __pattern, + memory_order __order) { + return __atomic_fetch_xor(__a, __pattern, + __atomic_order_to_int(__order)); +} + +template +inline _Tp __atomic_fetch_max_host(_Tp* __a, _Td __val, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected > __val ? __expected : __val; + + while(__desired == __val && + !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected > __val ? __expected : __val; + } + + return __expected; +} + +template +inline _Tp __atomic_fetch_min_host(_Tp* __a, _Td __val, + memory_order __order) { + auto __expected = __atomic_load_host(__a, memory_order_relaxed); + auto __desired = __expected < __val ? __expected : __val; + + while(__desired == __val && + !__atomic_compare_exchange_strong_host(__a, &__expected, __desired, __order, __order)) { + __desired = __expected < __val ? __expected : __val; + } + + return __expected; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMICS_HOST_H diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h new file mode 100644 index 0000000000..d5c37c45ec --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -0,0 +1,126 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef __LIBCUDACXX_ATOMIC_ORDER_H +#define __LIBCUDACXX_ATOMIC_ORDER_H + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ + __m == memory_order_acquire || \ + __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ + __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ + __f == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") + +#ifndef __ATOMIC_RELAXED +#define __ATOMIC_RELAXED 0 +#define __ATOMIC_CONSUME 1 +#define __ATOMIC_ACQUIRE 2 +#define __ATOMIC_RELEASE 3 +#define __ATOMIC_ACQ_REL 4 +#define __ATOMIC_SEQ_CST 5 +#endif //__ATOMIC_RELAXED + +// Figure out what the underlying type for `memory_order` would be if it were +// declared as an unscoped enum (accounting for -fshort-enums). Use this result +// to pin the underlying type in C++20. +enum __legacy_memory_order { + __mo_relaxed, + __mo_consume, + __mo_acquire, + __mo_release, + __mo_acq_rel, + __mo_seq_cst +}; + +typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; + +#if _CCCL_STD_VER > 2017 + +enum class memory_order : __memory_order_underlying_t { + relaxed = __mo_relaxed, + consume = __mo_consume, + acquire = __mo_acquire, + release = __mo_release, + acq_rel = __mo_acq_rel, + seq_cst = __mo_seq_cst +}; + +inline constexpr auto memory_order_relaxed = memory_order::relaxed; +inline constexpr auto memory_order_consume = memory_order::consume; +inline constexpr auto memory_order_acquire = memory_order::acquire; +inline constexpr auto memory_order_release = memory_order::release; +inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; +inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; + +#else + +typedef enum memory_order { + memory_order_relaxed = __mo_relaxed, + memory_order_consume = __mo_consume, + memory_order_acquire = __mo_acquire, + memory_order_release = __mo_release, + memory_order_acq_rel = __mo_acq_rel, + memory_order_seq_cst = __mo_seq_cst, +} memory_order; + +#endif // _CCCL_STD_VER > 2017 + +_LIBCUDACXX_HOST_DEVICE +inline int __stronger_order_cuda(int __a, int __b) { + int const __max = __a > __b ? __a : __b; + if(__max != __ATOMIC_RELEASE) + return __max; + static int const __xform[] = { + __ATOMIC_RELEASE, + __ATOMIC_ACQ_REL, + __ATOMIC_ACQ_REL, + __ATOMIC_RELEASE }; + return __xform[__a < __b ? __a : __b]; +} + +_LIBCUDACXX_HOST_DEVICE +inline constexpr int __atomic_order_to_int(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? __ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELEASE: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL: + __ATOMIC_CONSUME)))); +} + +_LIBCUDACXX_HOST_DEVICE +inline constexpr int __atomic_failure_order_to_int(memory_order __order) { + // Avoid switch statement to make this a constexpr. + return __order == memory_order_relaxed ? __ATOMIC_RELAXED: + (__order == memory_order_acquire ? 
__ATOMIC_ACQUIRE: + (__order == memory_order_release ? __ATOMIC_RELAXED: + (__order == memory_order_seq_cst ? __ATOMIC_SEQ_CST: + (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE: + __ATOMIC_CONSUME)))); +} + +static_assert((is_same::type, __memory_order_underlying_t>::value), + "unexpected underlying type for std::memory_order"); + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX_ATOMIC_ORDER_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform.h b/libcudacxx/include/cuda/std/__atomic/platform.h deleted file mode 100644 index 9a2f683d15..0000000000 --- a/libcudacxx/include/cuda/std/__atomic/platform.h +++ /dev/null @@ -1,14 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of libcu++, the C++ Standard Library for your entire system, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#if defined(_CCCL_COMPILER_MSVC) -#include -#endif diff --git a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h index d48c68acb4..f1ddff6dfd 100644 --- a/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h +++ b/libcudacxx/include/cuda/std/__atomic/platform/msvc_to_builtins.h @@ -9,10 +9,18 @@ // //===----------------------------------------------------------------------===// +#ifndef __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H +#define __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H + #ifndef _MSC_VER # error "This file is only for CL.EXE's benefit" #endif +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + #define _LIBCUDACXX_COMPILER_BARRIER() _ReadWriteBarrier() #if defined(_M_ARM) || defined(_M_ARM64) @@ -621,3 +629,7 @@ _Type __atomic_fetch_min(_Type volatile* __ptr, _Delta __val, int __memorder) } return __expected; } + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX___ATOMIC_PLATFORM_MSVC_H diff --git a/libcudacxx/include/cuda/std/__atomic/platform/platform.h b/libcudacxx/include/cuda/std/__atomic/platform/platform.h new file mode 100644 index 0000000000..cabb9de827 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/platform/platform.h @@ -0,0 +1,59 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#if defined(_CCCL_COMPILER_MSVC) +#include +#endif + +#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE +#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +#endif + +#if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +#define ATOMIC_BOOL_LOCK_FREE 2 +#define ATOMIC_CHAR_LOCK_FREE 2 +#define ATOMIC_CHAR16_T_LOCK_FREE 2 +#define ATOMIC_CHAR32_T_LOCK_FREE 2 +#define ATOMIC_WCHAR_T_LOCK_FREE 2 +#define ATOMIC_SHORT_LOCK_FREE 2 +#define ATOMIC_INT_LOCK_FREE 2 +#define ATOMIC_LONG_LOCK_FREE 2 +#define ATOMIC_LLONG_LOCK_FREE 2 +#define ATOMIC_POINTER_LOCK_FREE 2 +#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) + +#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +template struct __atomic_is_always_lock_free { + enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; +#else +template struct __atomic_is_always_lock_free { + enum { __value = sizeof(_Tp) <= 8 }; }; +#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) diff --git a/libcudacxx/include/cuda/std/__atomic/scopes.h b/libcudacxx/include/cuda/std/__atomic/scopes.h new file mode 100644 index 0000000000..3208227dc8 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/scopes.h @@ -0,0 +1,52 @@ +#ifndef __LIBCUDACXX_ATOMIC_SCOPES_H +#define __LIBCUDACXX_ATOMIC_SCOPES_H + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// REMEMBER CHANGES TO THESE ARE ABI BREAKING +// TODO: Space values out for potential new scopes +#ifndef __ATOMIC_BLOCK +#define __ATOMIC_SYSTEM 0 // 0 indicates default +#define __ATOMIC_DEVICE 1 +#define __ATOMIC_BLOCK 2 +#define __ATOMIC_THREAD 10 +#endif //__ATOMIC_BLOCK + +enum thread_scope { + thread_scope_system = __ATOMIC_SYSTEM, + thread_scope_device = __ATOMIC_DEVICE, + thread_scope_block = __ATOMIC_BLOCK, + thread_scope_thread = __ATOMIC_THREAD +}; + +#define _LIBCUDACXX_ATOMIC_SCOPE_TYPE ::cuda::thread_scope +#define _LIBCUDACXX_ATOMIC_SCOPE_DEFAULT ::cuda::thread_scope::system + +struct __thread_scope_thread_tag { }; +struct __thread_scope_block_tag { }; +struct __thread_scope_device_tag { }; +struct __thread_scope_system_tag { }; + +template struct __scope_enum_to_tag { }; +/* This would be the 
implementation once an actual thread-scope backend exists. +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_thread_tag; }; +Until then: */ +template<> struct __scope_enum_to_tag<(int)thread_scope_thread> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_block> { + using type = __thread_scope_block_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_device> { + using type = __thread_scope_device_tag; }; +template<> struct __scope_enum_to_tag<(int)thread_scope_system> { + using type = __thread_scope_system_tag; }; + +template +_LIBCUDACXX_INLINE_VISIBILITY auto constexpr __scope_tag() -> + typename __scope_enum_to_tag<_Scope>::type { + return typename __scope_enum_to_tag<_Scope>::type(); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // __LIBCUDACXX_ATOMIC_SCOPES_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h new file mode 100644 index 0000000000..ef197fd4ef --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -0,0 +1,60 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_BASE_H +#define _LIBCUDACXX___ATOMIC_STORAGE_BASE_H + +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +struct __atomic_base_tag {}; + +template +struct __atomic_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(is_trivially_copyable<_Tp>::value, + "std::atomic requires that 'Tp' be a trivially copyable type"); +#endif + + _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + + _LIBCUDACXX_HOST_DEVICE + __atomic_storage() noexcept + : __a_value() {} + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_storage(_Tp value) noexcept + : __a_value(value) {} + + _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() volatile -> volatile __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() const -> const __underlying_t* { + return &__a_value; + } + _LIBCUDACXX_HOST_DEVICE inline auto operator()() const volatile -> const volatile __underlying_t* { + return &__a_value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_BASE_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/common.h b/libcudacxx/include/cuda/std/__atomic/storage/common.h new file mode 100644 index 0000000000..22f946aada --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/common.h @@ -0,0 +1,46 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H +#define _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because +// the default operator= in an object is not volatile, a byte-by-byte copy +// is required. +template +__enable_if_t::value> +_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { + __a_value = __val; +} + +template +__enable_if_t::value> +_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { + volatile char* __to = reinterpret_cast(&__a_value); + volatile char* __end = __to + sizeof(_Tp); + volatile const char* __from = reinterpret_cast(&__val); + while (__to != __end) + *__to++ = *__from++; +} + +template +using __atomic_underlying_t = typename __remove_cvref_t<_Tp>::__underlying_t; + +template +using __atomic_tag_t = typename __remove_cvref_t<_Tp>::__tag_t; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_COMMON_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/locked.h b/libcudacxx/include/cuda/std/__atomic/storage/locked.h new file mode 100644 index 0000000000..ab359bc780 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/locked.h @@ -0,0 +1,204 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H +#define _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H + +#include + +#include + +#include +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Locked atomics must override the dispatch to be able to implement RMW primitives around the embedded lock. 
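The comment above is the key idea behind this new backend: every read-modify-write on a locked storage is an ordinary load/store bracketed by acquiring and releasing the embedded flag. As a purely illustrative aside (not part of the patch), the same pattern can be sketched standalone with plain std::atomic<bool> instead of the dispatch helpers; `locked_box` and its members are hypothetical names used only for this sketch.

#include <atomic>

// Sketch of the lock-around-RMW pattern implemented by the locked backend.
template <class T>
struct locked_box {
  T value{};
  mutable std::atomic<bool> locked{false};

  T fetch_add(T delta) {
    // Take the lock: exchange with acquire ordering, spin while it was held.
    while (locked.exchange(true, std::memory_order_acquire)) { /* spin */ }
    T old = value;        // plain load, protected by the lock
    value = old + delta;  // plain modify + store, still under the lock
    // Drop the lock: store with release ordering.
    locked.store(false, std::memory_order_release);
    return old;           // RMW returns the previous value, as the backend does
  }
};

Exchange-with-acquire to take the lock and store-with-release to drop it is the same ordering used by the __lock/__unlock members that follow.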
+struct __atomic_locked_tag {}; + +template +struct __atomic_locked_storage { + using __underlying_t = typename remove_cv<_Tp>::type; + using __tag_t = typename __atomic_locked_tag; + + _LIBCUDACXX_HOST_DEVICE + __atomic_locked_storage() noexcept + : __a_value(), __a_lock(0) {} + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_locked_storage(_Tp value) noexcept + : __a_value(value), __a_lock(0) {} + + _Tp __a_value; + mutable __atomic_storage<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a_lock; + + template + _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const volatile { + while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const { + while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) + /*spin*/; + } + template + _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const volatile { + __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } + template + _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const { + __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); + } +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_locked_tag) { + __atomic_assign_volatile(__a.__a_value, __val); +} + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order, _Sco, __atomic_locked_tag) { + __a.__lock(_Sco{}); + __atomic_assign_volatile(__a.__a_value, __val); + __a.__unlock(_Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __value); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __temp; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__temp, __a.__a_value); + bool __ret = __temp == *__expected; + if(__ret) + __atomic_assign_volatile(__a.__a_value, __value); + else + __atomic_assign_volatile(*__expected, __a.__a_value); + __a.__unlock(_Sco{}); + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE +bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __temp; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__temp, __a.__a_value); + bool __ret = __temp == *__expected; + if(__ret) + __atomic_assign_volatile(__a.__a_value, __value); + else + __atomic_assign_volatile(*__expected, __a.__a_value); + __a.__unlock(_Sco{}); + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> 
__atomic_fetch_add_dispatch(_Tp& __a, + _Td __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old + __delta)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, + ptrdiff_t __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __old + __delta); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __delta, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old - __delta)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old & __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, __atomic_underlying_t<_Tp>(__old | __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +template +_LIBCUDACXX_HOST_DEVICE +__atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, + __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { + __atomic_underlying_t<_Tp> __old; + __a.__lock(_Sco{}); + __atomic_assign_volatile(__old, __a.__a_value); + __atomic_assign_volatile(__a.__a_value, _Tp(__old ^ __pattern)); + __a.__unlock(_Sco{}); + return __old; +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_LOCKED_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h new file mode 100644 index 0000000000..3ead98703a --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -0,0 +1,48 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_REF_H +#define _LIBCUDACXX___ATOMIC_STORAGE_REF_H + +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Reference is compatible with __atomic_base_tag and uses default dispatch + +template +struct __atomic_ref_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_base_tag; + +#if !defined(_CCCL_COMPILER_GCC) || (__GNUC__ >= 5) + static_assert(is_trivially_copyable<_Tp>::value, + "std::atomic_ref requires that 'Tp' be a trivially copyable type"); +#endif + + _Tp* __a_value; + + _LIBCUDACXX_HOST_DEVICE constexpr explicit + __atomic_ref_storage(_Tp& value) noexcept + : __a_value(&value) {} + + _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + return __a_value; + } +}; + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_REF_H diff --git a/libcudacxx/include/cuda/std/__atomic/storage/small.h b/libcudacxx/include/cuda/std/__atomic/storage/small.h new file mode 100644 index 0000000000..679fbd5487 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/storage/small.h @@ -0,0 +1,177 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H +#define _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H + +#include + +#include + +#include +#include +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Atomic small types require conversion to/from a proxy type that can be +// manipulated by PTX without any performance overhead +struct __atomic_small_tag {}; + +template +using __atomic_small_proxy_t = __conditional_t::value, int32_t, uint32_t>; + +// Arithmetic conversions to/from proxy types +template::value, int> = 0> +constexpr _LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { + return static_cast<__atomic_small_proxy_t<_Tp>>(__val); +} + +template::value, int> = 0> +constexpr _LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { + return static_cast<_Tp>(__val); +} + +// Non-arithmetic conversion to/from proxy types +template::value, int> = 0> +_LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { + __atomic_small_proxy_t<_Tp> __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template::value, int> = 0> +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { + _Tp __temp{}; + memcpy(&__temp, &__val, sizeof(_Tp)); + return __temp; +} + +template +struct __atomic_small_storage { + using __underlying_t = _Tp; + using __tag_t = __atomic_small_tag; + using __proxy_t = __atomic_small_proxy_t<_Tp>; + + __atomic_small_storage() noexcept = default; + + _LIBCUDACXX_HOST_DEVICE + constexpr explicit __atomic_small_storage(_Tp __value) : __a_value(__atomic_small_to_32(__value)) {} + + __atomic_storage<__proxy_t> __a_value; +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> 
__val, _Sco, __atomic_small_tag) { + __atomic_init_dispatch(__a.__a_value, __atomic_small_to_32(__val), _Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + __atomic_store_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{}); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_load_dispatch(__a.__a_value, __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_exchange_dispatch(__a.__a_value, __atomic_small_to_32(__value), __order, _Sco{})); +} +_LIBCUDACXX_HOST_DEVICE +inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { + NV_DISPATCH_TARGET( + NV_IS_DEVICE, ( + auto __lhs_c = reinterpret_cast(__lhs); + auto __rhs_c = reinterpret_cast(__rhs); + while (__count--) { + auto const __lhs_v = *__lhs_c++; + auto const __rhs_v = *__rhs_c++; + if (__lhs_v < __rhs_v) { return -1; } + if (__lhs_v > __rhs_v) { return 1; } + } + return 0; + ), + NV_IS_HOST, ( + return memcmp(__lhs, __rhs, __count); + ) + ) +} + +template +_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { + auto __temp_expected = __atomic_small_to_32(*__expected); + auto const __ret = __atomic_compare_exchange_weak_dispatch(__a.__a_value, &__temp_expected, __atomic_small_to_32(__value), __success, __failure, _Sco{}); + auto const __actual = __atomic_small_from_32<__atomic_underlying_t<_Tp>>(__temp_expected); + constexpr auto __mask = static_cast((1u << (8*sizeof(__atomic_underlying_t<_Tp>))) - 1); + if(!__ret) { + if(0 == __cuda_memcmp(&__actual, __expected, sizeof(__atomic_underlying_t<_Tp>))) + __atomic_fetch_and_dispatch(__a.__a_value, __mask, memory_order_relaxed, _Sco{}); + else + *__expected = __actual; + } + return __ret; +} + +template +_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { + auto const __old = *__expected; + while(1) { + if(__atomic_compare_exchange_weak_dispatch(__a, __expected, __value, __success, __failure, _Sco{}, __atomic_small_tag{})) + return true; + if(0 != __cuda_memcmp(&__old, __expected, sizeof(__atomic_underlying_t<_Tp>))) + return false; + } +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_add_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_sub_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, 
__atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_and_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_or_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_xor_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_max_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +template +_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { + return __atomic_small_from_32<_Tp>(__atomic_fetch_min_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_STORAGE_SMALL_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h new file mode 100644 index 0000000000..87ac58ca73 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/notify_wait.h @@ -0,0 +1,188 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H +#define _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H + +#include + +#include +#include + +#include +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +// Leaving this in to figure out if we want this. +// For now this should be dead code, as we don't support platform wait. 
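+// A rough usage sketch of the entry points this header provides, assuming an
+// __atomic_storage<int> that is value-constructible (as __atomic_small_storage is)
+// and the __thread_scope_system_tag scope used by the <cuda/std/atomic> front end:
+//
+//   __atomic_storage<int> __a{0};
+//   // Waiter: blocks until a load no longer observes 0.
+//   __atomic_wait(&__a, 0, memory_order_acquire, __thread_scope_system_tag{});
+//
+//   // Notifier, on another thread, after storing a new value into __a:
+//   __atomic_notify_all(&__a, __thread_scope_system_tag{});
+//
+// On device, waiting degrades to polling via __atomic_try_wait_slow_fallback; the
+// branches immediately below are only reachable on hosts that define the
+// corresponding configuration macros.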
+#ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT + +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) + __libcpp_platform_wake(&__c->__version, true); +#endif +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + __cxx_atomic_notify_all(__a); +} +template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + return; + if(sizeof(__libcpp_platform_wait_t) < 8) { + constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. + __libcpp_platform_wait(&__c->__version, __version, &__timeout); + } + else + __libcpp_platform_wait(&__c->__version, __version, nullptr); +#else + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +} + +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); +#endif + __libcpp_platform_wait((_Tp*)__a, __val, nullptr); +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); +#endif +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +#endif + __libcpp_platform_wake((_Tp*)__a, true); +} +template ::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { +#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +#endif + __libcpp_platform_wake((_Tp*)__a, false); +} + +// Contention table wait/notify is also not supported as above. 
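+// In rough terms, the contention-table branch below keys each atomic address to a
+// shared record (via __libcpp_contention_state) holding a mutex, a condition
+// variable and a __credit counter.  A waiter takes the mutex, publishes
+// __credit = 1, re-checks the value and blocks on the condition variable only if
+// it is still unchanged; a notifier returns early unless it can consume that
+// credit, in which case it locks and unlocks the mutex to synchronize with an
+// in-flight waiter and then broadcasts.  Uncontended notifies therefore stay cheap.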
+#elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) + +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + auto * const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) + return; + if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { + __libcpp_mutex_lock(&__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); + __libcpp_condvar_broadcast(&__c->__condvar); + } +} +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { + __cxx_atomic_notify_all(__a); +} +template +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { + auto * const __c = __libcpp_contention_state(__a); + __libcpp_mutex_lock(&__c->__mutex); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); +} + +#else + +// Heterogeneous atomic impl begins here +extern "C" _CCCL_DEVICE void __atomic_try_wait_unsupported_before_SM_70__(); + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_try_wait_slow(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70, + __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{});, + NV_IS_HOST, + __atomic_try_wait_slow_fallback(__a, __val, __order, _Sco{});, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_one(_Tp const volatile*, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70,, + NV_IS_HOST,, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_notify_all(_Tp const volatile*, _Sco) { + NV_DISPATCH_TARGET( + NV_PROVIDES_SM_70,, + NV_IS_HOST,, + NV_ANY_TARGET, + __atomic_try_wait_unsupported_before_SM_70__(); + ); +} + +#endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) + +template _LIBCUDACXX_INLINE_VISIBILITY +bool __nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { +#if defined(_CCCL_CUDA_COMPILER) + return __lhs == __rhs; +#else + return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; +#endif +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void __atomic_wait(_Tp const volatile* __a, __atomic_underlying_t<_Tp> const __val, memory_order __order, _Sco = {}) { + for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { + if(!__nonatomic_compare_equal(__atomic_load_dispatch(*__a, __order, _Sco{}, __atomic_tag_t<_Tp>{}), __val)) + return; + if(__i < 12) + __libcpp_thread_yield_processor(); + else + __libcpp_thread_yield(); + } + while(__nonatomic_compare_equal(__atomic_load_dispatch(*__a, __order, _Sco{}, __atomic_tag_t<_Tp>{}), __val)) + __atomic_try_wait_slow(__a, __val, __order, _Sco{}); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif _LIBCUDACXX___ATOMIC_WAIT_NOTIFY_WAIT_H diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h new file mode 
100644 index 0000000000..4f4a8dd9a3 --- /dev/null +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCUDACXX___ATOMIC_WAIT_POLLING_H +#define _LIBCUDACXX___ATOMIC_WAIT_POLLING_H + +#include + +#include +#include +#include + +#include + +#include + +_LIBCUDACXX_BEGIN_NAMESPACE_STD + +template +struct __atomic_poll_tester { + using __underlying_t = __atomic_underlying_t<_Tp>; + + _Tp const volatile* __atom; + __underlying_t __val; + memory_order __order; + + _LIBCUDACXX_HOST_DEVICE + __atomic_poll_tester(_Tp const volatile* __a, __underlying_t __v, memory_order __o) + : __atom(__a) + , __val(__v) + , __order(__o) + {} + + _LIBCUDACXX_HOST_DEVICE + bool operator()() const { + return !(__atomic_load_dispatch(*__atom, __order, _Sco{}, __atomic_tag_t<_Tp>{}) == __val); + } +}; + +template +_LIBCUDACXX_HOST_DEVICE +void __atomic_try_wait_slow_fallback(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { + __libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); +} + +_LIBCUDACXX_END_NAMESPACE_STD + +#endif // _LIBCUDACXX___ATOMIC_WAIT_POLLING_H diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index 298b69726f..8e9eaf1664 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -589,166 +589,20 @@ void atomic_signal_fence(memory_order m) noexcept; # error C++ standard library is incompatible with #endif -#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ - __m == memory_order_acquire || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ - __f == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") - -#if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#endif +#include +#include -#if !defined(_CCCL_COMPILER_NVRTC) -# include -#endif +#include +#include +#include +#include +#include -#if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -#define ATOMIC_BOOL_LOCK_FREE 2 -#define ATOMIC_CHAR_LOCK_FREE 2 -#define ATOMIC_CHAR16_T_LOCK_FREE 2 -#define ATOMIC_CHAR32_T_LOCK_FREE 2 -#define ATOMIC_WCHAR_T_LOCK_FREE 2 -#define ATOMIC_SHORT_LOCK_FREE 2 -#define ATOMIC_INT_LOCK_FREE 2 -#define ATOMIC_LONG_LOCK_FREE 2 -#define ATOMIC_LLONG_LOCK_FREE 2 -#define ATOMIC_POINTER_LOCK_FREE 2 -#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) - -#ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#define __ATOMIC_CONSUME 1 -#define __ATOMIC_ACQUIRE 2 -#define __ATOMIC_RELEASE 3 -#define 
__ATOMIC_ACQ_REL 4 -#define __ATOMIC_SEQ_CST 5 -#endif //__ATOMIC_RELAXED +#include +#include _LIBCUDACXX_BEGIN_NAMESPACE_STD -// Figure out what the underlying type for `memory_order` would be if it were -// declared as an unscoped enum (accounting for -fshort-enums). Use this result -// to pin the underlying type in C++20. -enum __legacy_memory_order { - __mo_relaxed, - __mo_consume, - __mo_acquire, - __mo_release, - __mo_acq_rel, - __mo_seq_cst -}; - -typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; - -#if _CCCL_STD_VER > 2017 - -enum class memory_order : __memory_order_underlying_t { - relaxed = __mo_relaxed, - consume = __mo_consume, - acquire = __mo_acquire, - release = __mo_release, - acq_rel = __mo_acq_rel, - seq_cst = __mo_seq_cst -}; - -inline constexpr auto memory_order_relaxed = memory_order::relaxed; -inline constexpr auto memory_order_consume = memory_order::consume; -inline constexpr auto memory_order_acquire = memory_order::acquire; -inline constexpr auto memory_order_release = memory_order::release; -inline constexpr auto memory_order_acq_rel = memory_order::acq_rel; -inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; - -#else - -typedef enum memory_order { - memory_order_relaxed = __mo_relaxed, - memory_order_consume = __mo_consume, - memory_order_acquire = __mo_acquire, - memory_order_release = __mo_release, - memory_order_acq_rel = __mo_acq_rel, - memory_order_seq_cst = __mo_seq_cst, -} memory_order; - -#endif // _CCCL_STD_VER > 2017 - -template _LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { -#if defined(_CCCL_CUDA_COMPILER) - return __lhs == __rhs; -#else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; -#endif -} - -static_assert((is_same::type, __memory_order_underlying_t>::value), - "unexpected underlying type for std::memory_order"); - -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || \ - defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) - -// [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because -// the default operator= in an object is not volatile, a byte-by-byte copy -// is required. -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { - __a_value = __val; -} -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { - volatile char* __to = reinterpret_cast(&__a_value); - volatile char* __end = __to + sizeof(_Tp); - volatile const char* __from = reinterpret_cast(&__val); - while (__to != __end) - *__to++ = *__from++; -} - -#endif - -// Headers are wrapped like so: (cuda::std::|std::)detail -namespace __detail { -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) -# include -#endif - -#if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) -# include -#elif defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) -# include -#elif defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) -// TODO: Maybe support C11 atomics? 
-// #include -#endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP -} - -using __detail::__cxx_atomic_base_impl; -using __detail::__cxx_atomic_ref_base_impl; -using __detail::__cxx_atomic_thread_fence; -using __detail::__cxx_atomic_signal_fence; -using __detail::__cxx_atomic_load; -using __detail::__cxx_atomic_store; -using __detail::__cxx_atomic_exchange; -using __detail::__cxx_atomic_compare_exchange_weak; -using __detail::__cxx_atomic_compare_exchange_strong; -using __detail::__cxx_atomic_fetch_add; -using __detail::__cxx_atomic_fetch_sub; -using __detail::__cxx_atomic_fetch_or; -using __detail::__cxx_atomic_fetch_and; -using __detail::__cxx_atomic_fetch_xor; template _LIBCUDACXX_INLINE_VISIBILITY @@ -757,529 +611,22 @@ _Tp kill_dependency(_Tp __y) noexcept return __y; } -#if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE -#elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE -#endif - -#ifdef _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS - -template -struct __cxx_atomic_lock_impl { - - _LIBCUDACXX_INLINE_VISIBILITY - __cxx_atomic_lock_impl() noexcept - : __a_value(), __a_lock(0) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit - __cxx_atomic_lock_impl(_Tp value) noexcept - : __a_value(value), __a_lock(0) {} - - _Tp __a_value; - mutable __cxx_atomic_base_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, _Sco> __a_lock; - - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const { - __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile { - __lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a_value); - __unlock(); - return __old; - } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const { - __lock(); - _Tp __old = __a_value; - __unlock(); - 
return __old; - } -}; - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { - __a->__a_value = __val; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { - __a->__lock(); - __cxx_atomic_assign_volatile(__a->__a_value, __val); - __a->__unlock(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { - __a->__lock(); - __a->__a_value = __val; - __a->__unlock(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { - return __a->__read(); -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { - return __a->__read(); -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __value); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value = __value; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if(__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if(__ret) - __a->__a_value = __value; - else - *__expected = __a->__a_value; - __a->__unlock(); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - _Tp __temp; - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = __temp == *__expected; - if(__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = __a->__a_value == *__expected; - if(__ret) - __a->__a_value = __value; - else - *__expected = __a->__a_value; - __a->__unlock(); - return __ret; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, 
memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value -= __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value &= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value |= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value ^= __pattern; - __a->__unlock(); - return __old; -} - -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template 
struct __cxx_is_always_lock_free { - enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; - -#else - -template struct __cxx_is_always_lock_free { - enum { __value = sizeof(_Tp) <= 8 }; }; - -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - -template -struct __cxx_atomic_impl_conditional { - using type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, - __cxx_atomic_base_impl<_Tp, _Sco>, - __cxx_atomic_lock_impl<_Tp, _Sco> >; -}; - -template ::type > -#else -template > -#endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -struct __cxx_atomic_impl : public _Base { - __cxx_atomic_impl() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_impl(_Tp value) noexcept - : _Base(value) {} -}; - - -template -_LIBCUDACXX_INLINE_VISIBILITY -__cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) { - static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp),""); - static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp),""); - return (__cxx_atomic_impl<_Tp, _Sco>*)__inst; -} - -template -using __cxx_atomic_ref_impl = __cxx_atomic_ref_base_impl<_Tp, _Sco>; - -#ifdef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +template +struct __atomic_impl_traits { + static constexpr bool __atomic_requires_lock = __atomic_is_always_lock_free<_Tp>::__value; + static constexpr bool __atomic_requires_small = sizeof(_Tp) < 4; + static constexpr bool __atomic_supports_reference = sizeof(_Tp) >= 4 && sizeof(_Tp) <= 8; -template , int _Sco = _Ty::__sco> -struct __cxx_atomic_poll_tester { - _Ty const volatile* __a; - _Tp __val; - memory_order __order; + using __atomic_storage_t = typename __conditional_t<__atomic_requires_small, + __atomic_small_storage<_Tp>, + __conditional_t<__atomic_requires_lock, + __atomic_locked_storage<_Tp>, + __atomic_storage<_Tp> + >>; - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) - : __a(__a_) - , __val(__val_) - , __order(__order_) - {} - - _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return !(__cxx_atomic_load(__a, __order) == __val); - } + using __atomic_ref_storage_t = typename __atomic_ref_storage<_Tp>; }; -template , int _Sco = _Ty::__sco> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) { - __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); -} - -#endif - -#ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) - __libcpp_platform_wake(&__c->__version, true); -#endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * 
const __c = __libcpp_contention_state(__a); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(sizeof(__libcpp_platform_wait_t) < 8) { - constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. - __libcpp_platform_wait(&__c->__version, __version, &__timeout); - } - else - __libcpp_platform_wait(&__c->__version, __version, nullptr); -#else - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -} - -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); -#endif - __libcpp_platform_wait((_Tp*)__a, __val, nullptr); -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); -#endif -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, true); -} -template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, false); -} - -#elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) - return; - if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { - __libcpp_mutex_lock(&__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); - __libcpp_condvar_broadcast(&__c->__condvar); - } -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { - auto * const __c = __libcpp_contention_state(__a); - __libcpp_mutex_lock(&__c->__mutex); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); - 
__cxx_atomic_thread_fence(memory_order_seq_cst); - if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); -} - -#else - -template -struct __atomic_wait_and_notify_supported -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 - : false_type -#else - : true_type -#endif -{}; - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-one operations are unsupported on Pascal"); -} - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-all operations are unsupported on Pascal"); -} - -#endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) - -template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) { - for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { - if(!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(__i < 12) - __libcpp_thread_yield_processor(); - else - __libcpp_thread_yield(); - } - while(__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __cxx_atomic_try_wait_slow(__a, __val, __order); -} - template struct __atomic_base_storage { mutable _Storage __a_; @@ -1321,103 +668,103 @@ struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{ void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, 
_Tp __d, memory_order __s, memory_order __f) volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept { if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return 
__atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} }; template @@ -1446,103 +793,103 @@ struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_T void store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} + {__atomic_store_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} + {return __atomic_load_dispatch(this->__a_, __m);} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept {return load();} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} + {return __atomic_exchange_dispatch(this->__a_, __d, __m);} 
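+  // For the compare_exchange overloads below that take a single memory_order, the
+  // failure ordering is derived from the success ordering the same way the C++
+  // standard specifies for std::atomic: acq_rel maps to acquire and release maps
+  // to relaxed, since a failed compare-exchange performs no store.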
_LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} + {return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __s, __f);} _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept { if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_weak_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, 
memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept { if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_acquire); else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, memory_order_relaxed); else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + return __atomic_compare_exchange_strong_dispatch(this->__a_, &__e, __d, __m, __m); } _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} + {__atomic_wait_dispatch(this->__a_, __v, __m);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept - {__cxx_atomic_notify_one(&this->__a_);} + {__atomic_notify_one_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept - {__cxx_atomic_notify_all(&this->__a_);} + {__atomic_notify_all_dispatch(this->__a_);} }; template @@ -1559,16 +906,16 @@ struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) volatile noexcept {return fetch_add(_Tp(1));} @@ -1610,16 +957,16 @@ struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order 
__m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const volatile noexcept {return fetch_add(_Tp(1));} @@ -1661,22 +1008,22 @@ struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storag _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) volatile noexcept {return fetch_and(__op) & __op;} @@ -1706,22 +1053,22 @@ struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithme _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} + {return __atomic_fetch_and_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} + {return __atomic_fetch_or_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + 
{return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} + {return __atomic_fetch_xor_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const volatile noexcept {return fetch_and(__op) & __op;} @@ -1744,7 +1091,7 @@ using __atomic_select_base = __conditional_t::value, __atomic_base_bitwise<_Tp, _Cq, _Storage>, __atomic_base_core<_Tp, _Cq, _Storage> >>; -template >> +template ::__atomic_storage_t>> struct __atomic_base : public _Base { __atomic_base() = default; __atomic_base(const __atomic_base&) = delete; @@ -1755,10 +1102,10 @@ struct __atomic_base : public _Base { _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base(const _Tp& __a) noexcept : - _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) {} + _Base(__atomic_impl_traits<_Tp>::__atomic_storage_t(__a)) {} }; -template >> +template ::__atomic_ref_storage_t>> struct __atomic_base_ref : public _Base { __atomic_base_ref() = default; __atomic_base_ref(const __atomic_base_ref&) = default; @@ -1769,7 +1116,7 @@ struct __atomic_base_ref : public _Base { _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_ref(_Tp& __a) noexcept : - _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) {} + _Base(__atomic_impl_traits<_Tp>::__atomic_ref_storage_t(__a)) {} }; #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) @@ -1820,19 +1167,19 @@ struct atomic<_Tp*> _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) volatile noexcept {return fetch_add(1);} @@ -1902,11 +1249,11 @@ template _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} + {return __atomic_fetch_add_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + {return __atomic_fetch_sub_dispatch(this->__a_, __op, __m);} _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) const noexcept {return fetch_add(1);} @@ -1947,7 +1294,7 @@ _LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __atomic_init_dispatch(__o->__a_, __d); } template @@ -1955,7 +1302,7 @@ _LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __atomic_init_dispatch(__o->__a_, __d); } // 
atomic_store @@ -2553,47 +1900,47 @@ atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept typedef struct atomic_flag { - __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; + __atomic_impl_traits<_LIBCUDACXX_ATOMIC_FLAG_TYPE>::__atomic_storage_t __a_; _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} + {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__atomic_load_dispatch(__a_, __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} + {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__atomic_load_dispatch(__a_, __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} + {return __atomic_exchange_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} + {return __atomic_exchange_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + {__atomic_store_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + {__atomic_store_dispatch(__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m, __thread_scope_system_tag{}, __atomic_tag_t{});} #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} + {__atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} + {__atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&__a_);} + {__atomic_notify_one(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&__a_);} + {__atomic_notify_one(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&__a_);} + {__atomic_notify_all(&__a_, __thread_scope_system_tag{});} _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&__a_);} + {__atomic_notify_all(&__a_, __thread_scope_system_tag{});} #endif atomic_flag() noexcept = default; @@ -2759,14 +2106,14 @@ inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept { - 
__cxx_atomic_thread_fence(__m); + __atomic_thread_fence_dispatch(__m); } inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept { - __cxx_atomic_signal_fence(__m); + __atomic_signal_fence_dispatch(__m); } // Atomics for standard typedef types From a780c269f23e955fe572c725e0e1f3ea786b158b Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 13:04:45 -0700 Subject: [PATCH 06/71] Change atomic_storage operator()() to get() --- .../std/__atomic/operations/heterogeneous.h | 58 +++++++++---------- .../cuda/std/__atomic/operations/host.h | 4 +- .../include/cuda/std/__atomic/storage/base.h | 8 +-- .../cuda/std/__atomic/storage/reference.h | 2 +- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 86a142de08..5a87e876a6 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -66,7 +66,7 @@ using __atomic_enable_if_default_base_t = __enable_if_t> _LIBCUDACXX_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { - __atomic_assign_volatile(__a(), __val); + __atomic_assign_volatile(__a.get(), __val); } template > @@ -75,10 +75,10 @@ _LIBCUDACXX_HOST_DEVICE alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __atomic_store_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + __atomic_store_n_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - __atomic_store_host(__a(), __tmp, __order); + __atomic_store_host(__a.get(), __val, __order); ) ) } @@ -88,10 +88,10 @@ _LIBCUDACXX_HOST_DEVICE auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_load_n_cuda(__a(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_load_n_cuda(__a.get(), static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_load_host(__a(), __order); + return __atomic_load_host(__a.get(), __order); ) ) } @@ -102,10 +102,10 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underly alignas(_Tp) auto __tmp = __value; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_exchange_n_cuda(__a(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_exchange_n_cuda(__a.get(), __tmp, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_exchange_host(__a(), __tmp, __order); + return __atomic_exchange_host(__a.get(), __tmp, __order); ) ) } @@ -116,10 +116,10 @@ _LIBCUDACXX_HOST_DEVICE bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + __result = __atomic_compare_exchange_cuda(__a.get(), __expected, __val, false, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); ), NV_IS_HOST, ( - __result = __atomic_compare_exchange_strong_host(__a(), __expected, __val, __success, __failure); + __result = __atomic_compare_exchange_strong_host(__a.get(), __expected, __val, __success, __failure); ) ) return 
__result; @@ -131,10 +131,10 @@ _LIBCUDACXX_HOST_DEVICE bool __result = false; NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - __result = __atomic_compare_exchange_cuda(__a(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); + __result = __atomic_compare_exchange_cuda(__a.get(), __expected, __val, true, static_cast<__memory_order_underlying_t>(__success), static_cast<__memory_order_underlying_t>(__failure), _Sco{}); ), NV_IS_HOST, ( - __result = __atomic_compare_exchange_weak_host(__a(), __expected, __val, __success, __failure); + __result = __atomic_compare_exchange_weak_host(__a.get(), __expected, __val, __success, __failure); ) ) return __result; @@ -150,10 +150,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_add_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_add_host(__a(), __delta, __order); + return __atomic_fetch_add_host(__a.get(), __delta, __order); ) ) } @@ -163,10 +163,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_add_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_add_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_add_host(__a(), __delta, __order); + return __atomic_fetch_add_host(__a.get(), __delta, __order); ) ) } @@ -176,10 +176,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_sub_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_sub_cuda(__a(), __delta, __order); + return __atomic_fetch_sub_cuda(__a.get(), __delta, __order); ) ) } @@ -189,10 +189,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_sub_cuda(__a(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_sub_cuda(__a.get(), __delta, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_sub_host(__a(), __delta, __order); + return __atomic_fetch_sub_host(__a.get(), __delta, __order); ) ) } @@ -202,10 +202,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_and_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_and_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), 
_Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_and_host(__a(), __pattern, __order); + return __atomic_fetch_and_host(__a.get(), __pattern, __order); ) ) } @@ -215,10 +215,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_or_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_or_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_or_host(__a(), __pattern, __order); + return __atomic_fetch_or_host(__a.get(), __pattern, __order); ) ) } @@ -228,10 +228,10 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_xor_cuda(__a(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_xor_cuda(__a.get(), __pattern, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), NV_IS_HOST, ( - return __atomic_fetch_xor_host(__a(), __pattern, __order); + return __atomic_fetch_xor_host(__a.get(), __pattern, __order); ) ) } @@ -241,9 +241,9 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_max_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_max_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), ( - return __atomic_fetch_max_host(__a(), __val, __order); + return __atomic_fetch_max_host(__a.get(), __val, __order); ) ) } @@ -253,9 +253,9 @@ _LIBCUDACXX_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( - return __atomic_fetch_min_cuda(__a(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); + return __atomic_fetch_min_cuda(__a.get(), __val, static_cast<__memory_order_underlying_t>(__order), _Sco{}); ), ( - return __atomic_fetch_min_host(__a(), __val, __order); + return __atomic_fetch_min_host(__a.get(), __val, __order); ) ) } diff --git a/libcudacxx/include/cuda/std/__atomic/operations/host.h b/libcudacxx/include/cuda/std/__atomic/operations/host.h index 4870c011c4..e6015f33e4 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/host.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/host.h @@ -38,14 +38,14 @@ inline void __atomic_store_host(_Tp* __a, _Up __val, memory_order __order) { } template -inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> _Tp { +inline auto __atomic_load_host(_Tp* __a, memory_order __order) -> __remove_cvref_t<_Tp> { __remove_cvref_t<_Tp> __ret{}; __atomic_load(__a, &__ret, __atomic_order_to_int(__order)); return __ret; } template -inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> _Tp { +inline auto __atomic_exchange_host(_Tp* __a, _Up __val, memory_order __order) -> __remove_cvref_t<_Tp> { __remove_cvref_t<_Tp> __ret{}; __atomic_exchange(__a, &__val, &__ret, __atomic_order_to_int(__order)); return __ret; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h 
b/libcudacxx/include/cuda/std/__atomic/storage/base.h index ef197fd4ef..ca6a5fceaf 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -41,16 +41,16 @@ struct __atomic_storage { __atomic_storage(_Tp value) noexcept : __a_value(value) {} - _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() volatile -> volatile __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() const -> const __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() const -> const __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto operator()() const volatile -> const volatile __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { return &__a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h index 3ead98703a..a892f24d12 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/reference.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -38,7 +38,7 @@ struct __atomic_ref_storage { __atomic_ref_storage(_Tp& value) noexcept : __a_value(&value) {} - _LIBCUDACXX_HOST_DEVICE inline auto operator()() -> __underlying_t* { + _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { return __a_value; } }; From 217527d75683c8765845d99d84de02d4d34ab752 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 14:43:30 -0700 Subject: [PATCH 07/71] Fixup: Change desired of compexch to accept by value. * This matches other implementations. 
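* Illustrative call-site sketch (not part of this patch): with __desired taken
  by value, callers forward the value directly instead of materializing an
  addressable temporary first. The wrapper name below is hypothetical; only the
  __atomic_compare_exchange_cuda signature and tag type come from the generated
  header, and the sketch assumes a 4- or 8-byte _Type so one of the generated
  overloads applies.

    // Hypothetical caller, shown only to illustrate the signature change.
    template <class _Type>
    _CCCL_DEVICE bool __example_cas_block(_Type* __ptr, _Type* __expected, _Type __desired)
    {
      // Old signature: const _Type* __desired -- call sites passed &__desired.
      // New signature: const _Type __desired  -- call sites pass the value itself;
      // the generated body still memcpy's it into a 32/64-bit proxy integer.
      return __atomic_compare_exchange_cuda(__ptr, __expected, __desired,
                                            /*weak=*/false,
                                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST,
                                            __thread_scope_block_tag{});
    }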
--- libcudacxx/codegen/codegen.cpp | 2 +- .../operations/atomic_cuda_ptx_generated.h | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp index c1f809bd4b..2df154de05 100644 --- a/libcudacxx/codegen/codegen.cpp +++ b/libcudacxx/codegen/codegen.cpp @@ -305,7 +305,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD { out << "template = 0>\n"; out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv - << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int " + << "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int " "__failure_memorder, " << scopenametag(s.first) << ") {\n"; out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n"; diff --git a/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h index ff1bdcf1ff..52330eab5f 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/atomic_cuda_ptx_generated.h @@ -252,7 +252,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -286,7 +286,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -1159,7 +1159,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_block(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.cta.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, 
bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -1193,7 +1193,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_block_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -2429,7 +2429,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -2463,7 +2463,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -3336,7 +3336,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_device(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.gpu.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type 
*__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -3370,7 +3370,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_device_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -4606,7 +4606,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_32_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b32 %0,[%1],%2,%3;" : "=r"(__dst) : "l"(__ptr),"r"(__cmp),"r"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -4640,7 +4640,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint32_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 4); memcpy(&__old, __expected, 4); @@ -5513,7 +5513,7 @@ template static inli template static inline _CCCL_DEVICE void __cuda_compare_exchange_release_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.release.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template static inline _CCCL_DEVICE void __cuda_compare_exchange_volatile_64_system(_CUDA_A __ptr, _CUDA_B& __dst, _CUDA_C __cmp, _CUDA_D __op) { asm volatile("atom.cas.sys.b64 %0,[%1],%2,%3;" : "=l"(__dst) : "l"(__ptr),"l"(__cmp),"l"(__op) : "memory"); } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; 
memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); @@ -5547,7 +5547,7 @@ _CCCL_DEVICE bool __atomic_compare_exchange_cuda(volatile _Type *__ptr, _Type *_ return __ret; } template = 0> -_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { +_CCCL_DEVICE bool __atomic_compare_exchange_cuda(_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int __failure_memorder, __thread_scope_system_tag) { uint64_t __tmp = 0, __old = 0, __old_tmp; memcpy(&__tmp, &__desired, 8); memcpy(&__old, __expected, 8); From eaaa670a85108199d512d7416c8663515d3c10c1 Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 14:47:58 -0700 Subject: [PATCH 08/71] Fix merge conflicts (LIBCUDACXX->CCCL) --- .../std/__atomic/operations/heterogeneous.h | 34 ++++++++--------- libcudacxx/include/cuda/std/__atomic/order.h | 6 +-- .../include/cuda/std/__atomic/storage/base.h | 12 +++--- .../cuda/std/__atomic/storage/common.h | 4 +- .../cuda/std/__atomic/storage/locked.h | 36 +++++++++--------- .../cuda/std/__atomic/storage/reference.h | 4 +- .../include/cuda/std/__atomic/storage/small.h | 38 +++++++++---------- .../include/cuda/std/__atomic/wait/polling.h | 6 +-- 8 files changed, 70 insertions(+), 70 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 5a87e876a6..5bfa86661d 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -29,7 +29,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline void __atomic_thread_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( @@ -42,7 +42,7 @@ inline ) } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline void __atomic_signal_fence_dispatch(memory_order __order) { NV_DISPATCH_TARGET( @@ -64,13 +64,13 @@ template using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Tag = {}) { __atomic_assign_volatile(__a.get(), __val); } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { alignas(_Tp) auto __tmp = __val; NV_DISPATCH_TARGET( @@ -84,7 +84,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE auto __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco = {}, _Tag = {}) -> __atomic_underlying_t<_Tp> { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -97,7 +97,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco = {}, _Tag = {}) { alignas(_Tp) auto __tmp = __value; NV_DISPATCH_TARGET( @@ -111,7 +111,7 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underly } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { bool __result = false; NV_DISPATCH_TARGET( @@ 
-126,7 +126,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __val, memory_order __success, memory_order __failure, _Sco = {}, _Tag = {}) { bool __result = false; NV_DISPATCH_TARGET( @@ -146,7 +146,7 @@ template using __atomic_enable_if_not_ptr = __enable_if_t>::value, __atomic_underlying_t<_Tp>>; template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -159,7 +159,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -172,7 +172,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_not_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -185,7 +185,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_enable_if_ptr<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -198,7 +198,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -211,7 +211,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -224,7 +224,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco = {}, _Tag = {}) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -237,7 +237,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( @@ -249,7 +249,7 @@ _LIBCUDACXX_HOST_DEVICE } template > -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco = {}, _Tag = {}) { NV_IF_TARGET( NV_IS_DEVICE, ( diff --git a/libcudacxx/include/cuda/std/__atomic/order.h b/libcudacxx/include/cuda/std/__atomic/order.h index d5c37c45ec..0310f125b6 100644 --- a/libcudacxx/include/cuda/std/__atomic/order.h +++ b/libcudacxx/include/cuda/std/__atomic/order.h @@ -83,7 +83,7 @@ typedef enum memory_order { #endif // _CCCL_STD_VER > 2017 -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline int __stronger_order_cuda(int __a, int __b) { int const __max = __a > __b ? __a : __b; if(__max != __ATOMIC_RELEASE) @@ -96,7 +96,7 @@ inline int __stronger_order_cuda(int __a, int __b) { return __xform[__a < __b ? 
__a : __b]; } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline constexpr int __atomic_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: @@ -107,7 +107,7 @@ inline constexpr int __atomic_order_to_int(memory_order __order) { __ATOMIC_CONSUME)))); } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline constexpr int __atomic_failure_order_to_int(memory_order __order) { // Avoid switch statement to make this a constexpr. return __order == memory_order_relaxed ? __ATOMIC_RELAXED: diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h index ca6a5fceaf..75500d0769 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -34,23 +34,23 @@ struct __atomic_storage { _ALIGNAS(sizeof(_Tp)) _Tp __a_value; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_storage() noexcept : __a_value() {} - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_storage(_Tp value) noexcept : __a_value(value) {} - _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() -> __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() volatile -> volatile __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() const -> const __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() const -> const __underlying_t* { return &__a_value; } - _LIBCUDACXX_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() const volatile -> const volatile __underlying_t* { return &__a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/common.h b/libcudacxx/include/cuda/std/__atomic/storage/common.h index 22f946aada..48a3307616 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/common.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/common.h @@ -21,13 +21,13 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // is required. 
template __enable_if_t::value> -_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { +_CCCL_HOST_DEVICE __atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { __a_value = __val; } template __enable_if_t::value> -_LIBCUDACXX_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { +_CCCL_HOST_DEVICE __atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { volatile char* __to = reinterpret_cast(&__a_value); volatile char* __end = __to + sizeof(_Tp); volatile const char* __from = reinterpret_cast(&__val); diff --git a/libcudacxx/include/cuda/std/__atomic/storage/locked.h b/libcudacxx/include/cuda/std/__atomic/storage/locked.h index ab359bc780..2c579cf23a 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/locked.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/locked.h @@ -30,10 +30,10 @@ struct __atomic_locked_storage { using __underlying_t = typename remove_cv<_Tp>::type; using __tag_t = typename __atomic_locked_tag; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_locked_storage() noexcept : __a_value(), __a_lock(0) {} - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_locked_storage(_Tp value) noexcept : __a_value(value), __a_lock(0) {} @@ -41,33 +41,33 @@ struct __atomic_locked_storage { mutable __atomic_storage<_LIBCUDACXX_ATOMIC_FLAG_TYPE> __a_lock; template - _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const volatile { + _CCCL_HOST_DEVICE void __lock(_Sco) const volatile { while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) /*spin*/; } template - _LIBCUDACXX_HOST_DEVICE void __lock(_Sco) const { + _CCCL_HOST_DEVICE void __lock(_Sco) const { while(1 == __atomic_exchange_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire, _Sco{})) /*spin*/; } template - _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const volatile { + _CCCL_HOST_DEVICE void __unlock(_Sco) const volatile { __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); } template - _LIBCUDACXX_HOST_DEVICE void __unlock(_Sco) const { + _CCCL_HOST_DEVICE void __unlock(_Sco) const { __atomic_store_dispatch(__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release, _Sco{}); } }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_locked_tag) { __atomic_assign_volatile(__a.__a_value, __val); } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order, _Sco, __atomic_locked_tag) { __a.__lock(_Sco{}); __atomic_assign_volatile(__a.__a_value, __val); @@ -75,7 +75,7 @@ void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; __a.__lock(_Sco{}); @@ -85,7 +85,7 @@ __atomic_underlying_t<_Tp> __atomic_load_dispatch(const _Tp& __a, memory_order, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; __a.__lock(_Sco{}); @@ -96,7 +96,7 @@ __atomic_underlying_t<_Tp> __atomic_exchange_dispatch(_Tp& __a, 
__atomic_underly } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __temp; @@ -112,7 +112,7 @@ bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __temp; @@ -128,7 +128,7 @@ bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, _Td __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -140,7 +140,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, ptrdiff_t __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -152,7 +152,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_add_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -164,7 +164,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_sub_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -176,7 +176,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_and_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; @@ -188,7 +188,7 @@ __atomic_underlying_t<_Tp> __atomic_fetch_or_dispatch(_Tp& __a, } template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE __atomic_underlying_t<_Tp> __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order, _Sco, __atomic_locked_tag) { __atomic_underlying_t<_Tp> __old; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/reference.h b/libcudacxx/include/cuda/std/__atomic/storage/reference.h index a892f24d12..cfd2e3a5e2 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/reference.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/reference.h @@ -34,11 +34,11 @@ struct __atomic_ref_storage { _Tp* __a_value; - _LIBCUDACXX_HOST_DEVICE constexpr explicit + _CCCL_HOST_DEVICE constexpr explicit __atomic_ref_storage(_Tp& value) noexcept : __a_value(&value) {} - _LIBCUDACXX_HOST_DEVICE inline auto get() -> __underlying_t* { + _CCCL_HOST_DEVICE inline auto get() -> __underlying_t* { return __a_value; } }; diff --git a/libcudacxx/include/cuda/std/__atomic/storage/small.h b/libcudacxx/include/cuda/std/__atomic/storage/small.h index 679fbd5487..1f4c88abd1 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/small.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/small.h @@ -33,25 +33,25 @@ using __atomic_small_proxy_t = __conditional_t::value, int32_t, u // Arithmetic conversions 
to/from proxy types template::value, int> = 0> -constexpr _LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { +constexpr _CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { return static_cast<__atomic_small_proxy_t<_Tp>>(__val); } template::value, int> = 0> -constexpr _LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { +constexpr _CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { return static_cast<_Tp>(__val); } // Non-arithmetic conversion to/from proxy types template::value, int> = 0> -_LIBCUDACXX_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { +_CCCL_HOST_DEVICE inline __atomic_small_proxy_t<_Tp> __atomic_small_to_32(_Tp __val) { __atomic_small_proxy_t<_Tp> __temp{}; memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; } template::value, int> = 0> -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { +_CCCL_HOST_DEVICE inline _Tp __atomic_small_from_32(__atomic_small_proxy_t<_Tp> __val) { _Tp __temp{}; memcpy(&__temp, &__val, sizeof(_Tp)); return __temp; @@ -65,33 +65,33 @@ struct __atomic_small_storage { __atomic_small_storage() noexcept = default; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE constexpr explicit __atomic_small_storage(_Tp __value) : __a_value(__atomic_small_to_32(__value)) {} __atomic_storage<__proxy_t> __a_value; }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_init_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, _Sco, __atomic_small_tag) { __atomic_init_dispatch(__a.__a_value, __atomic_small_to_32(__val), _Sco{}); } template -_LIBCUDACXX_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline void __atomic_store_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { __atomic_store_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{}); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_load_dispatch(_Tp const& __a, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_load_dispatch(__a.__a_value, __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_exchange_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __value, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_exchange_dispatch(__a.__a_value, __atomic_small_to_32(__value), __order, _Sco{})); } -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) { NV_DISPATCH_TARGET( NV_IS_DEVICE, ( @@ -112,7 +112,7 @@ inline int __cuda_memcmp(void const * __lhs, void const * __rhs, size_t __count) } template -_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, 
__atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { auto __temp_expected = __atomic_small_to_32(*__expected); auto const __ret = __atomic_compare_exchange_weak_dispatch(__a.__a_value, &__temp_expected, __atomic_small_to_32(__value), __success, __failure, _Sco{}); auto const __actual = __atomic_small_from_32<__atomic_underlying_t<_Tp>>(__temp_expected); @@ -127,7 +127,7 @@ _LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_weak_dispatch(_Tp& } template -_LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_Tp& __a, __atomic_underlying_t<_Tp>* __expected, __atomic_underlying_t<_Tp> __value, memory_order __success, memory_order __failure, _Sco, __atomic_small_tag) { auto const __old = *__expected; while(1) { if(__atomic_compare_exchange_weak_dispatch(__a, __expected, __value, __success, __failure, _Sco{}, __atomic_small_tag{})) @@ -138,37 +138,37 @@ _LIBCUDACXX_HOST_DEVICE inline bool __atomic_compare_exchange_strong_dispatch(_T } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_add_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_add_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_sub_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __delta, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_sub_dispatch(__a.__a_value, __atomic_small_to_32(__delta), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_and_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_and_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_or_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_or_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_xor_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __pattern, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_xor_dispatch(__a.__a_value, __atomic_small_to_32(__pattern), __order, _Sco{})); } 
template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_max_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_max_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); } template -_LIBCUDACXX_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { +_CCCL_HOST_DEVICE inline _Tp __atomic_fetch_min_dispatch(_Tp& __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco, __atomic_small_tag) { return __atomic_small_from_32<_Tp>(__atomic_fetch_min_dispatch(__a.__a_value, __atomic_small_to_32(__val), __order, _Sco{})); } diff --git a/libcudacxx/include/cuda/std/__atomic/wait/polling.h b/libcudacxx/include/cuda/std/__atomic/wait/polling.h index 4f4a8dd9a3..0a5e06c28f 100644 --- a/libcudacxx/include/cuda/std/__atomic/wait/polling.h +++ b/libcudacxx/include/cuda/std/__atomic/wait/polling.h @@ -32,21 +32,21 @@ struct __atomic_poll_tester { __underlying_t __val; memory_order __order; - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE __atomic_poll_tester(_Tp const volatile* __a, __underlying_t __v, memory_order __o) : __atom(__a) , __val(__v) , __order(__o) {} - _LIBCUDACXX_HOST_DEVICE + _CCCL_HOST_DEVICE bool operator()() const { return !(__atomic_load_dispatch(*__atom, __order, _Sco{}, __atomic_tag_t<_Tp>{}) == __val); } }; template -_LIBCUDACXX_HOST_DEVICE +_CCCL_HOST_DEVICE void __atomic_try_wait_slow_fallback(_Tp const volatile* __a, __atomic_underlying_t<_Tp> __val, memory_order __order, _Sco) { __libcpp_thread_poll_with_backoff(__atomic_poll_tester<_Tp, _Sco>(__a, __val, __order)); } From 452fc3b0c55cd43dfdaa543fce764ba8163c680d Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 16:09:27 -0700 Subject: [PATCH 09/71] Fix another merge conflict (LIBCUDACXX->CCCL) --- libcudacxx/include/cuda/std/__atomic/storage/base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/include/cuda/std/__atomic/storage/base.h b/libcudacxx/include/cuda/std/__atomic/storage/base.h index 75500d0769..1725c4e819 100644 --- a/libcudacxx/include/cuda/std/__atomic/storage/base.h +++ b/libcudacxx/include/cuda/std/__atomic/storage/base.h @@ -32,7 +32,7 @@ struct __atomic_storage { "std::atomic requires that 'Tp' be a trivially copyable type"); #endif - _ALIGNAS(sizeof(_Tp)) _Tp __a_value; + _CCCL_ALIGNAS(sizeof(_Tp)) _Tp __a_value; _CCCL_HOST_DEVICE __atomic_storage() noexcept From 91f8b11623205e0515763f099dc69eb1458acb1a Mon Sep 17 00:00:00 2001 From: Wesley Maxey Date: Thu, 18 Apr 2024 16:11:08 -0700 Subject: [PATCH 10/71] Simplify tag dispatch in the atomic backend --- .../std/__atomic/operations/heterogeneous.h | 73 +++++++-------- .../include/cuda/std/__atomic/storage/base.h | 3 +- .../cuda/std/__atomic/storage/common.h | 3 - .../cuda/std/__atomic/storage/locked.h | 91 +++++++++---------- .../cuda/std/__atomic/storage/reference.h | 3 +- .../include/cuda/std/__atomic/storage/small.h | 63 +++++++------ 6 files changed, 111 insertions(+), 125 deletions(-) diff --git a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h index 5bfa86661d..9ef3fcf51e 100644 --- a/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h +++ 
b/libcudacxx/include/cuda/std/__atomic/operations/heterogeneous.h @@ -55,23 +55,16 @@ inline ) } -// Regarding __atomic_base_Tag -// It *is* possible to define it as: -// _Tag = __atomic_enable_if_default_base_t<_Tp> and make all tag types default to the 'base' backend -// I don't know if it's necessary to do that though. For now, this just adds some kind of protection -// preventing access to the functions with the wrong tag type. -template -using __atomic_enable_if_default_base_t = __enable_if_t, __atomic_base_tag>::value, __atomic_tag_t<_Tp>>; - -template > +// automatically dispatch based on default argument of '_Sto<_Tp, tag_t>' +template