Skip to content

Commit

Permalink
Merge branch 'main' into fea/atomic_ref_8_16_bit_support
Browse files Browse the repository at this point in the history
  • Loading branch information
wmaxey committed Oct 7, 2024
2 parents 8612403 + c86caca commit 928465f
Show file tree
Hide file tree
Showing 31 changed files with 2,415 additions and 1,779 deletions.
45 changes: 45 additions & 0 deletions libcudacxx/include/cuda/__barrier/aligned_size.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H
#define _CUDA___BARRIER_ALIGNED_SIZE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/cstddef>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// A size value tagged at compile time with an alignment.
//
// `align` exposes the `_Alignment` template argument and `value` stores the
// size handed to the constructor. Nothing in this type checks that `value` is
// a multiple of `align`.
// NOTE(review): presumably callers guarantee that relationship - confirm
// against the users of this type.
template <_CUDA_VSTD::size_t _Alignment>
struct aligned_size_t
{
  // Alignment carried by this type (the template argument).
  static constexpr _CUDA_VSTD::size_t align = _Alignment;
  // The wrapped size.
  _CUDA_VSTD::size_t value;

  // Wraps `__s`. The constructor is explicit, so a plain size is never
  // implicitly converted into an aligned_size_t.
  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s)
      : value(__s)
  {}

  // Implicit conversion back to the wrapped size.
  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const
  {
    return value;
  }
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_ALIGNED_SIZE_H
35 changes: 35 additions & 0 deletions libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// Type only used for logging purpose.
// NOTE(review): the enumerator names suggest "no asynchronous contract" versus
// "asynchronous contract" - confirm the exact meaning against the code that
// produces these values.
enum async_contract_fulfillment
{
  none  = 0, // explicit value, identical to the implicit one
  async = 1  // explicit value, identical to the implicit one
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
62 changes: 62 additions & 0 deletions libcudacxx/include/cuda/__barrier/barrier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_BARRIER_H
#define _CUDA___BARRIER_BARRIER_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__fwd/barrier.h>
#include <cuda/std/__atomic/scopes.h>
#include <cuda/std/__barrier/barrier.h>
#include <cuda/std/__barrier/empty_completion.h>
#include <cuda/std/__new_>
#include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// Barrier parameterised on a thread scope and a completion function.
//
// All state and behaviour are inherited from
// _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>; this class only adds the
// constructors and the friend init() overloads below. Copy construction and
// copy assignment are deleted.
template <thread_scope _Sco, class _CompletionF>
class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>
{
  // Shorthand for the base class.
  using __base_type = _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>;

public:
  _CCCL_HIDE_FROM_ABI barrier() = default;

  // Forwards the expected count and the completion function to the base class.
  // When no completion function is given, a default-constructed one is used.
  _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected,
                                              _CompletionF __completion = _CompletionF())
      : __base_type(__expected, __completion)
  {}

  barrier(const barrier&)            = delete;
  barrier& operator=(const barrier&) = delete;

  // Constructs a barrier in the storage pointed to by `__b` using placement
  // new, with a default-constructed completion function. Asserts that
  // `__expected` is not negative.
  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected)
  {
    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
    new (__b) barrier(__expected);
  }

  // Same as the overload above, but with a caller-supplied completion function.
  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion)
  {
    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
    new (__b) barrier(__expected, __completion);
  }
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_BARRIER_H
94 changes: 94 additions & 0 deletions libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_
#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if defined(_CCCL_CUDA_COMPILER)
# if __cccl_ptx_isa >= 800

# include <cuda/__barrier/barrier_block_scope.h>
# include <cuda/__barrier/barrier_native_handle.h>
# include <cuda/__ptx/instructions/mbarrier_arrive.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/__ptx/ptx_helper_functions.h>
# include <cuda/std/__atomic/scopes.h>
# include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

// Only declared in this header. It is called on the branch taken when the
// target does not provide SM_90.
// NOTE(review): presumably left undefined on purpose so that such a use fails
// at link time - confirm.
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();

// Arrives on the block-scoped barrier `__b`, updating both its arrival count
// and its expected transaction count, and returns the resulting arrival token.
//
// Parameters:
//   __b                         barrier to arrive on; asserted to be in shared memory.
//   __arrive_count_update       arrival count update; asserted to be in [1, 2^20 - 1].
//   __transaction_count_update  transaction count update; asserted to be in [0, 2^20 - 1].
//
// When the target provides SM_90:
//   - an arrival count update of 1 is issued as a single
//     mbarrier_arrive_expect_tx with release semantics at CTA scope;
//   - any other count is issued as a relaxed mbarrier.expect_tx followed by an
//     mbarrier_arrive with release semantics at CTA scope.
// Otherwise the declared-only function above is called and a value-initialized
// token is returned.
_CCCL_NODISCARD _CCCL_DEVICE inline barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
  barrier<thread_scope_block>& __b,
  _CUDA_VSTD::ptrdiff_t __arrive_count_update,
  _CUDA_VSTD::ptrdiff_t __transaction_count_update)
{
  _CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
  _CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one.");
  _CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1.");
  _CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
  _CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

  barrier<thread_scope_block>::arrival_token __token = {};
  // On architectures pre-sm90, arrive_tx is not supported.
  // We do not check for the statespace of the barrier here. This is
  // on purpose. This allows debugging tools like memcheck/racecheck
  // to detect that we are passing a pointer with the wrong state
  // space to mbarrier.arrive. If we checked for the state space here,
  // and __trap() if wrong, then those tools would not be able to help
  // us in release builds. In debug builds, the error would be caught
  // by the asserts at the top of this function.
  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_90,
    (
      auto __native_handle = barrier_native_handle(__b);
      if (__arrive_count_update == 1) {
        __token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
          _CUDA_VPTX::sem_release,
          _CUDA_VPTX::scope_cta,
          _CUDA_VPTX::space_shared,
          __native_handle,
          __transaction_count_update);
      } else {
        // The shared-space address is only needed by the hand-written PTX below.
        auto __bh = __cvta_generic_to_shared(__native_handle);
        // This statement has no output operands; `volatile` makes sure it is
        // neither deleted nor moved when PTX is generated.
        asm volatile("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
                     :
                     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
                       "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
                     : "memory");
        __token = _CUDA_VPTX::mbarrier_arrive(
          _CUDA_VPTX::sem_release,
          _CUDA_VPTX::scope_cta,
          _CUDA_VPTX::space_shared,
          __native_handle,
          __arrive_count_update);
      }),
    (__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();));
  return __token;
}

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

# endif // __cccl_ptx_isa >= 800
#endif // _CCCL_CUDA_COMPILER

#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_
Loading

0 comments on commit 928465f

Please sign in to comment.