Merge branch 'main' into fea/atomic_ref_8_16_bit_support

NVIDIA · Oct 7, 2024 · 928465f · 928465f
2 parents 8612403 + c86caca
commit 928465f
Show file tree

Hide file tree

Showing 31 changed files with 2,415 additions and 1,779 deletions.
diff --git a/libcudacxx/include/cuda/__barrier/aligned_size.h b/libcudacxx/include/cuda/__barrier/aligned_size.h
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H
+#define _CUDA___BARRIER_ALIGNED_SIZE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/cstddef>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// Size value tagged at compile time with an alignment.
+//
+// `align` exposes the `_Alignment` template argument and `value` holds the size passed to the
+// constructor. Construction from a plain size is explicit; conversion back to a plain size is
+// implicit.
+//
+// NOTE(review): nothing in this type checks that `value` is a multiple of `_Alignment`, or that
+// `_Alignment` is a power of two; presumably the caller guarantees this - confirm at the use sites.
+template <_CUDA_VSTD::size_t _Alignment>
+struct aligned_size_t
+{
+  // Alignment carried by this type.
+  static constexpr _CUDA_VSTD::size_t align = _Alignment;
+  // The wrapped size, stored exactly as given to the constructor.
+  _CUDA_VSTD::size_t value;
+
+  // Wraps `__s` without modifying or validating it. The parameter type is spelled with the same
+  // qualified `_CUDA_VSTD::size_t` as the members so the header does not depend on a global `size_t`.
+  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s)
+      : value(__s)
+  {}
+  // Returns the wrapped size unchanged.
+  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const
+  {
+    return value;
+  }
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BARRIER_ALIGNED_SIZE_H
diff --git a/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h b/libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
+#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// Type only used for logging purposes.
+// NOTE(review): the enumerator names suggest "not fulfilled asynchronously" (none) versus
+// "fulfilled asynchronously" (async); confirm against the functions that return this type.
+enum async_contract_fulfillment
+{
+  none  = 0,
+  async = 1
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
diff --git a/libcudacxx/include/cuda/__barrier/barrier.h b/libcudacxx/include/cuda/__barrier/barrier.h
@@ -0,0 +1,62 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___BARRIER_BARRIER_H
+#define _CUDA___BARRIER_BARRIER_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__fwd/barrier.h>
+#include <cuda/std/__atomic/scopes.h>
+#include <cuda/std/__barrier/barrier.h>
+#include <cuda/std/__barrier/empty_completion.h>
+#include <cuda/std/__new_>
+#include <cuda/std/cstdint>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
+
+// Barrier parameterized on a thread scope `_Sco` and a completion function type `_CompletionF`.
+//
+// The class derives publicly from `_CUDA_VSTD::__barrier_base<_CompletionF, _Sco>` and declares no
+// data members of its own; it only adds constructors and the `init` friend functions below.
+template <thread_scope _Sco, class _CompletionF>
+class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>
+{
+public:
+  // Barriers cannot be copied.
+  barrier(const barrier&)            = delete;
+  barrier& operator=(const barrier&) = delete;
+
+  _CCCL_HIDE_FROM_ABI barrier() = default;
+
+  // Forwards the expected arrival count and the completion function to the base class.
+  _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected,
+                                              _CompletionF __completion = _CompletionF())
+      : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion)
+  {}
+
+  // Constructs a barrier in place at `__b` with `__expected` arrivals and the given completion
+  // function. Asserts that `__expected` is not negative.
+  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion)
+  {
+    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
+    new (__b) barrier(__expected, __completion);
+  }
+
+  // Same as above, but the completion function is default-constructed by the constructor's
+  // default argument.
+  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected)
+  {
+    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
+    new (__b) barrier(__expected);
+  }
+};
+
+_LIBCUDACXX_END_NAMESPACE_CUDA
+
+#endif // _CUDA___BARRIER_BARRIER_H
diff --git a/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h b/libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h
@@ -0,0 +1,94 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_
+#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if defined(_CCCL_CUDA_COMPILER)
+#  if __cccl_ptx_isa >= 800
+
+#    include <cuda/__barrier/barrier_block_scope.h>
+#    include <cuda/__barrier/barrier_native_handle.h>
+#    include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#    include <cuda/__ptx/ptx_dot_variants.h>
+#    include <cuda/__ptx/ptx_helper_functions.h>
+#    include <cuda/std/__atomic/scopes.h>
+#    include <cuda/std/cstdint>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE
+// Arrives on `__b` with `__arrive_count_update` arrivals, issues an expect-tx of `__transaction_count_update`, and returns the token.
+extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); // only declared here; called on the pre-SM_90 path below
+_CCCL_NODISCARD _CCCL_DEVICE inline barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
+  barrier<thread_scope_block>& __b,
+  _CUDA_VSTD::ptrdiff_t __arrive_count_update,
+  _CUDA_VSTD::ptrdiff_t __transaction_count_update)
+{
+  _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+  _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one.");
+  _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1.");
+  _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+  // Upper limit, see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+  _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+  barrier<thread_scope_block>::arrival_token __token = {}; // returned unchanged if the pre-SM_90 branch is taken
+  // On architectures pre-sm90, arrive_tx is not supported; that case takes the second
+  // branch of NV_IF_ELSE_TARGET below. We deliberately do not check the state space of
+  // the barrier here. This allows debugging tools like memcheck/racecheck to detect that
+  // we are passing a pointer with the wrong state space to mbarrier.arrive. If we checked
+  // for the state space here, and called __trap() when it is wrong, then those tools would
+  // not be able to help us in release builds. In debug builds, the error would be caught
+  // by the asserts at the top of this function. On the SM_90 branch, an arrival count of
+  // one uses a single arrive-expect-tx call; any other count issues expect_tx, then arrive.
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_90,
+    (
+      // Native handle of the barrier, plus its __cvta_generic_to_shared address used by the inline asm below.
+      auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle);
+      if (__arrive_count_update == 1) { // single arrival: one call performs both the arrive and the expect-tx
+        __token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
+          _CUDA_VPTX::sem_release,
+          _CUDA_VPTX::scope_cta,
+          _CUDA_VPTX::space_shared,
+          __native_handle,
+          __transaction_count_update);
+      } else { // other counts: expect_tx first, then arrive with the requested count
+        asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+            :
+            : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
+              "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
+            : "memory");
+        __token = _CUDA_VPTX::mbarrier_arrive(
+          _CUDA_VPTX::sem_release,
+          _CUDA_VPTX::scope_cta,
+          _CUDA_VPTX::space_shared,
+          __native_handle,
+          __arrive_count_update);
+      }),
+    (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); // pre-SM_90 path
+  return __token;
+}
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
+
+#  endif // __cccl_ptx_isa >= 800
+#endif // _CCCL_CUDA_COMPILER
+
+#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_