-
Notifications
You must be signed in to change notification settings - Fork 141
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into fea/atomic_ref_8_16_bit_support
- Loading branch information
Showing
31 changed files
with
2,415 additions
and
1,779 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H | ||
#define _CUDA___BARRIER_ALIGNED_SIZE_H | ||
|
||
#include <cuda/std/detail/__config> | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
#include <cuda/std/cstddef> | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA | ||
|
||
template <_CUDA_VSTD::size_t _Alignment> | ||
struct aligned_size_t | ||
{ | ||
static constexpr _CUDA_VSTD::size_t align = _Alignment; | ||
_CUDA_VSTD::size_t value; | ||
|
||
_LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(size_t __s) | ||
: value(__s) | ||
{} | ||
_LIBCUDACXX_HIDE_FROM_ABI constexpr operator size_t() const | ||
{ | ||
return value; | ||
} | ||
}; | ||
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA | ||
|
||
#endif // _CUDA___BARRIER_ALIGNED_SIZE_H |
35 changes: 35 additions & 0 deletions
35
libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H | ||
#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H | ||
|
||
#include <cuda/std/detail/__config> | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA | ||
|
||
// Type only used for logging purposes.
// The enumerator values are written out explicitly; they are the same values
// the enumerators would receive implicitly.
enum async_contract_fulfillment
{
  none  = 0,
  async = 1
};
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA | ||
|
||
#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA___BARRIER_BARRIER_H | ||
#define _CUDA___BARRIER_BARRIER_H | ||
|
||
#include <cuda/std/detail/__config> | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
#include <cuda/__fwd/barrier.h> | ||
#include <cuda/std/__atomic/scopes.h> | ||
#include <cuda/std/__barrier/barrier.h> | ||
#include <cuda/std/__barrier/empty_completion.h> | ||
#include <cuda/std/__new_> | ||
#include <cuda/std/cstdint> | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA | ||
|
||
template <thread_scope _Sco, class _CompletionF> | ||
class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco> | ||
{ | ||
public: | ||
_CCCL_HIDE_FROM_ABI barrier() = default; | ||
|
||
barrier(const barrier&) = delete; | ||
barrier& operator=(const barrier&) = delete; | ||
|
||
_LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected, | ||
_CompletionF __completion = _CompletionF()) | ||
: _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion) | ||
{} | ||
|
||
_LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected) | ||
{ | ||
_CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); | ||
new (__b) barrier(__expected); | ||
} | ||
|
||
_LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion) | ||
{ | ||
_CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count"); | ||
new (__b) barrier(__expected, __completion); | ||
} | ||
}; | ||
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA | ||
|
||
#endif // _CUDA___BARRIER_BARRIER_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
// -*- C++ -*- | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_ | ||
#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_ | ||
|
||
#include <cuda/std/detail/__config> | ||
|
||
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
# pragma GCC system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
# pragma clang system_header | ||
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
# pragma system_header | ||
#endif // no system header | ||
|
||
#if defined(_CCCL_CUDA_COMPILER) | ||
# if __cccl_ptx_isa >= 800 | ||
|
||
# include <cuda/__barrier/barrier_block_scope.h> | ||
# include <cuda/__barrier/barrier_native_handle.h> | ||
# include <cuda/__ptx/instructions/mbarrier_arrive.h> | ||
# include <cuda/__ptx/ptx_dot_variants.h> | ||
# include <cuda/__ptx/ptx_helper_functions.h> | ||
# include <cuda/std/__atomic/scopes.h> | ||
# include <cuda/std/cstdint> | ||
|
||
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE | ||
|
||
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__(); | ||
_CCCL_NODISCARD _CCCL_DEVICE inline barrier<thread_scope_block>::arrival_token barrier_arrive_tx( | ||
barrier<thread_scope_block>& __b, | ||
_CUDA_VSTD::ptrdiff_t __arrive_count_update, | ||
_CUDA_VSTD::ptrdiff_t __transaction_count_update) | ||
{ | ||
_CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory."); | ||
_CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one."); | ||
_CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1."); | ||
_CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative."); | ||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object | ||
_CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1."); | ||
|
||
barrier<thread_scope_block>::arrival_token __token = {}; | ||
// On architectures pre-sm90, arrive_tx is not supported. | ||
// We do not check for the statespace of the barrier here. This is | ||
// on purpose. This allows debugging tools like memcheck/racecheck | ||
// to detect that we are passing a pointer with the wrong state | ||
// space to mbarrier.arrive. If we checked for the state space here, | ||
// and __trap() if wrong, then those tools would not be able to help | ||
// us in release builds. In debug builds, the error would be caught | ||
// by the asserts at the top of this function. | ||
NV_IF_ELSE_TARGET( | ||
NV_PROVIDES_SM_90, | ||
( | ||
|
||
auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle); | ||
if (__arrive_count_update == 1) { | ||
__token = _CUDA_VPTX::mbarrier_arrive_expect_tx( | ||
_CUDA_VPTX::sem_release, | ||
_CUDA_VPTX::scope_cta, | ||
_CUDA_VPTX::space_shared, | ||
__native_handle, | ||
__transaction_count_update); | ||
} else { | ||
asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;" | ||
: | ||
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)), | ||
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update)) | ||
: "memory"); | ||
__token = _CUDA_VPTX::mbarrier_arrive( | ||
_CUDA_VPTX::sem_release, | ||
_CUDA_VPTX::scope_cta, | ||
_CUDA_VPTX::space_shared, | ||
__native_handle, | ||
__arrive_count_update); | ||
}), | ||
(__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();)); | ||
return __token; | ||
} | ||
|
||
_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE | ||
|
||
# endif // __cccl_ptx_isa >= 800 | ||
#endif // _CCCL_CUDA_COMPILER | ||
|
||
#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_ |
Oops, something went wrong.