Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup and modularize <cuda/std/barrier> #2443

Merged
merged 28 commits into from
Oct 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
666d484
Drop `_LIBCUDACXX_HAS_NO_TREE_BARRIER`
miscco Sep 17, 2024
eab67a6
Minor cleanups
miscco Sep 17, 2024
7c07c6c
Move poll tester to their own file
miscco Sep 17, 2024
f1897b8
Move `empty_completion` to its own file
miscco Sep 17, 2024
786dd42
Move `barrier` to its own file
miscco Sep 17, 2024
2c80dc0
Move `cuda::aligned_size` to its own file
miscco Sep 20, 2024
b84b4ee
Move forward declaration of `pipeline` to its own file
miscco Sep 20, 2024
71f7976
Move `async_contract_fulfillment` to its own file
miscco Sep 20, 2024
a4df38c
Move `completion_mechanism` to its own file
miscco Sep 20, 2024
fe87f9d
Move `cuda::barrier` to its own file
miscco Sep 20, 2024
9c5d340
Move forward declaration of `barrier_native_handle` into its own file
miscco Sep 20, 2024
d5d1db3
Move `barrier<thread_scope_block>` to its own file
miscco Sep 20, 2024
6040778
Move implementation of `barrier_native_handle` to its own file
miscco Sep 20, 2024
1e4932e
Move `barrier_arrive_tx` to its own file
miscco Sep 20, 2024
8b6877c
Move `barrier_expect_tx` to its own file
miscco Sep 20, 2024
ad54b5c
Move `memcpy_async_tx` to its own file
miscco Sep 20, 2024
b6431a8
Move `barrier<thread_scope_thread>` into its own file
miscco Sep 20, 2024
7878b66
Move `__is_local_smem_barrier` to its own file
miscco Sep 20, 2024
b9c7db0
Move `__try_get_barrier_handle` to its own file
miscco Sep 20, 2024
4fc4e45
Move `__memcpy_completion_impl` to its own file
miscco Sep 20, 2024
a2ca10f
Move `__cp_async_bulk_shared_global` to its own file
miscco Sep 20, 2024
dd6ab72
Move `__cp_async_shared_global` to its own file
miscco Sep 20, 2024
b6e0ad2
Move `__cp_async_fallback` to its own file
miscco Sep 20, 2024
6d76358
Move `__dispatch_memcpy_async` to its own file
miscco Sep 20, 2024
cb75f6c
Move `__memcpy_async_barrier` to its own file
miscco Sep 20, 2024
3a49b47
Move `memcpy_async` to its own file
miscco Sep 20, 2024
8a56233
Cleanup barrier header
miscco Sep 20, 2024
d2244e6
Shorten some names
miscco Sep 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions libcudacxx/include/cuda/__barrier/aligned_size.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_ALIGNED_SIZE_H
#define _CUDA___BARRIER_ALIGNED_SIZE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/cstddef>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// Pairs a runtime size with an alignment that is carried in the type.
//
// `align` exposes the `_Alignment` template argument and `value` stores the
// size handed to the constructor. Nothing in this type checks that `value` is
// actually a multiple of `align`.
// NOTE(review): presumably callers are expected to guarantee that -- confirm.
template <_CUDA_VSTD::size_t _Alignment>
struct aligned_size_t
{
  // Alignment encoded in the type.
  static constexpr _CUDA_VSTD::size_t align = _Alignment;
  // Size supplied at construction.
  _CUDA_VSTD::size_t value;

  // Explicit, so a plain size is never silently tagged with an alignment.
  // The parameter type is spelled `_CUDA_VSTD::size_t` to match the members
  // above instead of depending on an unqualified `size_t` being visible.
  _LIBCUDACXX_HIDE_FROM_ABI explicit constexpr aligned_size_t(_CUDA_VSTD::size_t __s) noexcept
      : value(__s)
  {}

  // Implicit conversion back to the plain size.
  _LIBCUDACXX_HIDE_FROM_ABI constexpr operator _CUDA_VSTD::size_t() const noexcept
  {
    return value;
  }
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_ALIGNED_SIZE_H
35 changes: 35 additions & 0 deletions libcudacxx/include/cuda/__barrier/async_contract_fulfillment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
#define _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// Per the original note, this type is only used for logging purposes.
// It is an unscoped enumeration, so both enumerators are also visible in the
// enclosing scope and convert implicitly to integers.
enum async_contract_fulfillment
{
  none  = 0,
  async = 1
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_ASYNC_CONTRACT_FULFILLMENT_H
62 changes: 62 additions & 0 deletions libcudacxx/include/cuda/__barrier/barrier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BARRIER_BARRIER_H
#define _CUDA___BARRIER_BARRIER_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__fwd/barrier.h>
#include <cuda/std/__atomic/scopes.h>
#include <cuda/std/__barrier/barrier.h>
#include <cuda/std/__barrier/empty_completion.h>
#include <cuda/std/__new_>
#include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

// Barrier parameterized on a thread scope `_Sco` and a completion function
// type `_CompletionF`.
//
// The class derives publicly from `_CUDA_VSTD::__barrier_base<_CompletionF, _Sco>`
// and declares no data members of its own; it only contributes constructors
// and the `init` hidden friends, which construct a barrier in place in storage
// the caller provides.
template <thread_scope _Sco, class _CompletionF>
class barrier : public _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>
{
public:
  _CCCL_HIDE_FROM_ABI barrier() = default;

  // Barriers are not copyable.
  barrier(const barrier&)            = delete;
  barrier& operator=(const barrier&) = delete;

  // Forwards the expected arrival count and the completion function to the base.
  _LIBCUDACXX_HIDE_FROM_ABI constexpr barrier(_CUDA_VSTD::ptrdiff_t __expected,
                                              _CompletionF __completion = _CompletionF())
      : _CUDA_VSTD::__barrier_base<_CompletionF, _Sco>(__expected, __completion)
  {}

  // Constructs a barrier with a default-constructed completion function in the
  // storage pointed to by `__b`. `__expected` is asserted to be non-negative.
  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected)
  {
    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
    // Qualified `::new` on a `void*` always selects the global placement form;
    // it cannot be intercepted by a class-scope `operator new` or by an
    // overload taking `barrier*`.
    ::new (static_cast<void*>(__b)) barrier(__expected);
  }

  // Same as above, but with a caller-supplied completion function.
  _LIBCUDACXX_HIDE_FROM_ABI friend void init(barrier* __b, _CUDA_VSTD::ptrdiff_t __expected, _CompletionF __completion)
  {
    _CCCL_ASSERT(__expected >= 0, "Cannot initialize barrier with negative arrival count");
    // See the note in the two-argument overload on why `::new` + `void*` is used.
    ::new (static_cast<void*>(__b)) barrier(__expected, __completion);
  }
};

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA___BARRIER_BARRIER_H
94 changes: 94 additions & 0 deletions libcudacxx/include/cuda/__barrier/barrier_arrive_tx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_BARRIER_ARRIVE_TX_H_
#define _CUDA_PTX_BARRIER_ARRIVE_TX_H_

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if defined(_CCCL_CUDA_COMPILER)
# if __cccl_ptx_isa >= 800

# include <cuda/__barrier/barrier_block_scope.h>
# include <cuda/__barrier/barrier_native_handle.h>
# include <cuda/__ptx/instructions/mbarrier_arrive.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/__ptx/ptx_helper_functions.h>
# include <cuda/std/__atomic/scopes.h>
# include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

// Declared here but not defined anywhere in this file. It is only called on
// the fallback branch of barrier_arrive_tx below, i.e. when NV_PROVIDES_SM_90
// does not hold.
// NOTE(review): presumably left undefined on purpose so that such a use fails
// at link time with a self-describing symbol name -- confirm.
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
// Arrives on a block-scope barrier and updates its transaction count.
//
// __b                         barrier to arrive on; asserted to be in shared memory
// __arrive_count_update       asserted to be in [1, 2^20 - 1]
// __transaction_count_update  asserted to be in [0, 2^20 - 1]
//
// Returns the arrival token obtained from the mbarrier arrive call; the
// function is marked _CCCL_NODISCARD.
//
// On targets that provide SM_90 there are two paths:
//  - arrive count update == 1: a single mbarrier_arrive_expect_tx call
//    (release, cta scope, shared space) carries the transaction count.
//  - otherwise: mbarrier.expect_tx.relaxed is issued with the transaction
//    count first, followed by mbarrier_arrive (release) with the arrive count.
_CCCL_NODISCARD _CCCL_DEVICE inline barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
barrier<thread_scope_block>& __b,
_CUDA_VSTD::ptrdiff_t __arrive_count_update,
_CUDA_VSTD::ptrdiff_t __transaction_count_update)
{
_CCCL_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
_CCCL_ASSERT(1 <= __arrive_count_update, "Arrival count update must be at least one.");
_CCCL_ASSERT(__arrive_count_update <= (1 << 20) - 1, "Arrival count update cannot exceed 2^20 - 1.");
_CCCL_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
_CCCL_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

// Value-initialized so that a token exists to return on every path.
barrier<thread_scope_block>::arrival_token __token = {};
// On architectures pre-sm90, arrive_tx is not supported.
// We do not check for the statespace of the barrier here. This is
// on purpose. This allows debugging tools like memcheck/racecheck
// to detect that we are passing a pointer with the wrong state
// space to mbarrier.arrive. If we checked for the state space here,
// and __trap() if wrong, then those tools would not be able to help
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_90,
(

auto __native_handle = barrier_native_handle(__b); auto __bh = __cvta_generic_to_shared(__native_handle);
if (__arrive_count_update == 1) {
__token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
_CUDA_VPTX::sem_release,
_CUDA_VPTX::scope_cta,
_CUDA_VPTX::space_shared,
__native_handle,
__transaction_count_update);
} else {
asm("mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive(
_CUDA_VPTX::sem_release,
_CUDA_VPTX::scope_cta,
_CUDA_VPTX::space_shared,
__native_handle,
__arrive_count_update);
}),
(__cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();));
return __token;
}

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

# endif // __cccl_ptx_isa >= 800
#endif // _CCCL_CUDA_COMPILER

#endif // _CUDA_PTX_BARRIER_ARRIVE_TX_H_
Loading
Loading