Skip to content

Commit

Permalink
op/cuda: Lazily initialize the CUDA information
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph Schuchart <[email protected]>
  • Loading branch information
devreal committed Jun 20, 2024
1 parent 53336c3 commit 6a85957
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 38 deletions.
2 changes: 2 additions & 0 deletions ompi/mca/op/cuda/op_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ ompi_op_base_stream_handler_fn_t ompi_op_cuda_functions[OMPI_OP_BASE_FORTRAN_OP_
extern
ompi_op_base_3buff_stream_handler_fn_t ompi_op_cuda_3buff_functions[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];

void ompi_op_cuda_lazy_init();

END_C_DECLS

#endif /* MCA_OP_CUDA_EXPORT_H */
96 changes: 58 additions & 38 deletions ompi/mca/op/cuda/op_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ static struct ompi_op_base_module_1_0_0_t *
cuda_component_op_query(struct ompi_op_t *op, int *priority);
static int cuda_component_register(void);

/* Mutex taken by ompi_op_cuda_lazy_init() around the one-time query of the
 * CUDA devices, so that only one thread populates the component's
 * per-device arrays. */
static opal_mutex_t init_lock = OPAL_MUTEX_STATIC_INIT;
/* Set to true by ompi_op_cuda_lazy_init() (while holding init_lock) once the
 * device information has been stored in mca_op_cuda_component; checked
 * without the lock on entry to skip locking after initialization. */
static bool init_complete = false;

ompi_op_cuda_component_t mca_op_cuda_component = {
{
.opc_version = {
Expand Down Expand Up @@ -128,44 +131,6 @@ static int
cuda_component_init_query(bool enable_progress_threads,
bool enable_mpi_thread_multiple)
{
int num_devices;
int rc;
// TODO: is this init needed here?
cuInit(0);
CHECK(cuDeviceGetCount, (&num_devices));
mca_op_cuda_component.cu_num_devices = num_devices;
mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice));
mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int));
mca_op_cuda_component.cu_max_blocks = (int*)malloc(num_devices*sizeof(int));
for (int i = 0; i < num_devices; ++i) {
CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i));
rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i],
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
mca_op_cuda_component.cu_devices[i]);
if (CUDA_SUCCESS != rc) {
/* fall-back to value that should work on every device */
mca_op_cuda_component.cu_max_threads_per_block[i] = 512;
}
if (-1 < mca_op_cuda_component.cu_max_num_threads) {
if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) {
mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads;
}
}

rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i],
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
mca_op_cuda_component.cu_devices[i]);
if (CUDA_SUCCESS != rc) {
/* fall-back to value that should work on every device */
mca_op_cuda_component.cu_max_blocks[i] = 512;
}
if (-1 < mca_op_cuda_component.cu_max_num_blocks) {
if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) {
mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks;
}
}
}

return OMPI_SUCCESS;
}

Expand Down Expand Up @@ -193,3 +158,58 @@ cuda_component_op_query(struct ompi_op_t *op, int *priority)
*priority = 50;
return (ompi_op_base_module_1_0_0_t *) module;
}

/**
 * Populate the CUDA device information of mca_op_cuda_component on first use.
 *
 * Queries the number of devices and, for each device, the maximum block
 * dimension and maximum grid dimension in X, storing them in
 * cu_max_threads_per_block[] and cu_max_blocks[]. If the component's
 * cu_max_num_threads / cu_max_num_blocks settings are non-negative they act
 * as upper bounds on the queried values.
 *
 * The function may be called repeatedly and from multiple threads; only the
 * first caller performs the queries, all later calls return after checking
 * the init_complete flag.
 */
void ompi_op_cuda_lazy_init(void)
{
    /* Double checked locking to avoid having to
     * grab locks post lazy-initialization. */
    if (init_complete) {
        /* The read barrier has to come after the flag has been read so that
         * the caller's subsequent reads of the component arrays cannot be
         * ordered before it; it pairs with the write barrier issued before
         * the flag is set below. */
        opal_atomic_rmb();
        return;
    }

    OPAL_THREAD_LOCK(&init_lock);

    /* Re-check under the lock: another thread may have completed the
     * initialization while we were waiting. */
    if (!init_complete) {
        int num_devices;
        int rc;
        // TODO: is this init needed here?
        cuInit(0);
        CHECK(cuDeviceGetCount, (&num_devices));
        mca_op_cuda_component.cu_num_devices = num_devices;
        /* NOTE(review): the allocation results are not checked; this function
         * has no way to report failure to its caller -- confirm how an
         * out-of-memory condition should be handled here. */
        mca_op_cuda_component.cu_devices = (CUdevice*)malloc(num_devices*sizeof(CUdevice));
        mca_op_cuda_component.cu_max_threads_per_block = (int*)malloc(num_devices*sizeof(int));
        mca_op_cuda_component.cu_max_blocks = (int*)malloc(num_devices*sizeof(int));
        for (int i = 0; i < num_devices; ++i) {
            CHECK(cuDeviceGet, (&mca_op_cuda_component.cu_devices[i], i));

            /* threads per block: device limit, capped by the user setting */
            rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_threads_per_block[i],
                                      CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                      mca_op_cuda_component.cu_devices[i]);
            if (CUDA_SUCCESS != rc) {
                /* fall-back to value that should work on every device */
                mca_op_cuda_component.cu_max_threads_per_block[i] = 512;
            }
            if (-1 < mca_op_cuda_component.cu_max_num_threads) {
                if (mca_op_cuda_component.cu_max_threads_per_block[i] >= mca_op_cuda_component.cu_max_num_threads) {
                    mca_op_cuda_component.cu_max_threads_per_block[i] = mca_op_cuda_component.cu_max_num_threads;
                }
            }

            /* blocks per grid: device limit, capped by the user setting */
            rc = cuDeviceGetAttribute(&mca_op_cuda_component.cu_max_blocks[i],
                                      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                      mca_op_cuda_component.cu_devices[i]);
            if (CUDA_SUCCESS != rc) {
                /* fall-back to value that should work on every device */
                mca_op_cuda_component.cu_max_blocks[i] = 512;
            }
            if (-1 < mca_op_cuda_component.cu_max_num_blocks) {
                if (mca_op_cuda_component.cu_max_blocks[i] >= mca_op_cuda_component.cu_max_num_blocks) {
                    mca_op_cuda_component.cu_max_blocks[i] = mca_op_cuda_component.cu_max_num_blocks;
                }
            }
        }
        /* Make all stores to the component structure visible before the
         * flag that publishes them. */
        opal_atomic_wmb();
        init_complete = true;
    }
    OPAL_THREAD_UNLOCK(&init_lock);
}
2 changes: 2 additions & 0 deletions ompi/mca/op/cuda/op_cuda_functions.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ static inline void device_op_pre(const void *orig_source1,
uint64_t target_flags = -1, source1_flags = -1, source2_flags = -1;
int target_rc, source1_rc, source2_rc = -1;

ompi_op_cuda_lazy_init();

*target = orig_target;
*source1 = (void*)orig_source1;
if (NULL != orig_source2) {
Expand Down

0 comments on commit 6a85957

Please sign in to comment.