Skip to content

Commit

Permalink
ocl: updated tuned parameters (#842)
Browse files Browse the repository at this point in the history
* ocl: updated tuned parameters

* tune_multiply.py
  - Make certain resources unique on a per-rank basis.
  - Only save JSON-file in case of tangible result.
  - Create DB-directory without prior removal.
  - Reset label when input stems from file.
  - Avoid caching kernels.

* Code cleanup
  - Disabled ACC_OPENCL_PROFILE (c_dbcsr_acc_set_active_device),
    as it is potentially too early to use the timer facility.
  - NULL-device refers to device 0 (c_dbcsr_acc_opencl_device_name).
  - Print separate error message (acc_bench_smm).
  - Improved OPENCL_LIBSMM_VALIDATE.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
hfp and pre-commit-ci[bot] authored Sep 11, 2024
1 parent f4e8c38 commit b12b740
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 102 deletions.
5 changes: 3 additions & 2 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ int main(int argc, char* argv[]) {
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
int print_offset = 0;
char print_buffer[1024];
char print_buffer[1024] = "";
# if defined(__OPENCL)
const char* const env_smm_repeat = getenv("SMM_NREPEAT");
const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
Expand Down Expand Up @@ -497,7 +497,7 @@ int main(int argc, char* argv[]) {
if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
if (0 < epsilon) {
if (LIBXSMM_NOTNAN(diff.v_tst)) {
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst));
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs);
}
else {
PRINTF(" (%g)\n", diff.v_tst);
Expand All @@ -508,6 +508,7 @@ int main(int argc, char* argv[]) {
}
if (0 < check && check < epsilon) result = EXIT_FAILURE;
}
else fprintf(stderr, "ERROR: failed to validate!\n");
}
# endif
}
Expand Down
7 changes: 5 additions & 2 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,9 @@ int c_dbcsr_acc_opencl_device_name(
cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup) {
int result_name = 0, result_platform = 0;
assert(NULL != name || NULL != platform);
if (NULL == device && 0 < c_dbcsr_acc_opencl_config.ndevices) {
device = c_dbcsr_acc_opencl_config.devices[0]; /* NULL-device refers to device 0 */
}
if (NULL != name && 0 != name_maxlen) {
result_name = clGetDeviceInfo(device, CL_DEVICE_NAME, name_maxlen, name, NULL);
if (0 != cleanup && EXIT_SUCCESS == result_name) {
Expand Down Expand Up @@ -1157,7 +1160,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i

int c_dbcsr_acc_set_active_device(int device_id) {
int result = EXIT_SUCCESS;
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) && 0
int routine_handle;
static const char* const routine_name_ptr = LIBXSMM_FUNCNAME;
static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1;
Expand All @@ -1177,7 +1180,7 @@ int c_dbcsr_acc_set_active_device(int device_id) {
# if !defined(NDEBUG)
else result = EXIT_FAILURE;
# endif
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) && 0
c_dbcsr_timestop(&routine_handle);
# endif
ACC_OPENCL_RETURN(result);
Expand Down
122 changes: 48 additions & 74 deletions src/acc/opencl/smm/opencl_libsmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,20 @@
libxsmm_gemm_descriptor_dinit(BLOB, PREC, M, N, K, LDA, LDB, LDC, 1.0, 1.0, FLAGS, PREFETCH)
# endif

# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && defined(OPENCL_LIBSMM_VALIDATE) && \
(1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_TRANS
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SMM) && defined(OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_SMM
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && defined(OPENCL_LIBSMM_VALIDATE) && 1
# define OPENCL_LIBSMM_VALIDATE_EXIT
# if defined(OPENCL_LIBSMM_VALIDATE)
# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_TRANS
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
# define OPENCL_LIBSMM_VALIDATE_SMM
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && 1
# define OPENCL_LIBSMM_VALIDATE_EXIT
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SCRATCH)
# define OPENCL_LIBSMM_VALIDATE_SCRATCH(SIZE, ALIGN) /*libxsmm_aligned_scratch(SIZE, ALIGN)*/ malloc(SIZE)
# define OPENCL_LIBSMM_VALIDATE_FREE(PTR) /*libxsmm_free(PTR)*/ free(PTR)
# endif
# endif
# if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS)
# define OPENCL_LIBSMM_KERNELNAME_TRANS "trans"
Expand Down Expand Up @@ -111,31 +116,6 @@ int opencl_libsmm_use_cmem(cl_device_id device) {
}


# if defined(OPENCL_LIBSMM_VALIDATE) && (0 != OPENCL_LIBSMM_VALIDATE)
/**
 * Debug helper: writes an m-by-n matrix to ostream as text, one row per line.
 *
 * ostream: destination stream for all output.
 * label:   optional text printed in front of the first row; NULL is treated
 *          as an empty string.
 * type:    element type selector; dbcsr_type_real_8 reads elements as double,
 *          dbcsr_type_real_4 reads elements as float, any other value prints
 *          a "? " placeholder per element.
 * mat:     matrix data, read at linear index i * n + j for row i, column j.
 * m, n:    number of rows and columns to print.
 */
void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n) {
int i, j;
/* Fall back to an empty label so that strlen/fprintf never see NULL. */
const char* const s = (NULL != label ? label : "");
const int len = (int)strlen(s);
for (i = 0; i < m; ++i) {
if (0 < i) {
/* Rows after the first: emit blanks as wide as the label so the columns line up
 * (an empty label still yields the single blank passed as argument). */
fprintf(ostream, "%*s", len, " ");
}
else {
/* First row carries the label itself. */
fprintf(ostream, "%s", s);
}
for (j = 0; j < n; ++j) {
/* Each element is printed with two decimals followed by a blank. */
switch (type) {
case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i * n + j]); break;
case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*)mat)[i * n + j]); break;
default: fprintf(ostream, "? "); /* unsupported element type */
}
}
fprintf(ostream, "\n");
}
}
# endif


int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key,
const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close) {
int result = 0;
Expand Down Expand Up @@ -786,7 +766,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/
+ data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/
+ 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
if (NULL != scratch) {
stack = (int*)scratch;
imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT);
Expand Down Expand Up @@ -855,20 +835,15 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
}
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result);
# endif
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
# endif
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
if (EXIT_SUCCESS == result) {
int i, j;
LIBXSMM_STDIO_ACQUIRE();
char print_buffer[2048] = "";
int print_offset = 0, i, j;
if (0 != c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr,
"libsmm_acc_transpose("
"offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n,
max_kernel_dim, stream);
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
stream);
}
for (i = offset; i < offset_stack_size; ++i) {
const size_t index = stack[i];
Expand All @@ -879,20 +854,12 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
libxsmm_itrans(gold, typesize, m, n, m, n);
if (0 != memcmp(gold, test, mn * typesize)) {
if (0 == c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr,
"libsmm_acc_transpose("
"offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m,
n, max_kernel_dim, stream);
}
fprintf(stderr, " => ERROR\n");
if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "stackposition = %i (index=%llu)\n", i, (unsigned long long)index);
opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
fprintf(stderr, "\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
stream);
}
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
# else
Expand All @@ -903,7 +870,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
for (j = offset; j < i; ++j) {
const size_t duplicate = stack[j];
if (index == duplicate) {
fprintf(stderr, " => ERROR\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
# else
Expand All @@ -915,8 +882,10 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
}
}
if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
fprintf(stderr, " => OK\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
}
LIBXSMM_STDIO_ACQUIRE();
fputs(print_buffer, stderr);
LIBXSMM_STDIO_RELEASE();
}
libxsmm_free(scratch);
Expand Down Expand Up @@ -1342,7 +1311,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
&blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize +
5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
if (NULL != desc && NULL != scratch) {
pinp = (int*)scratch;
ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT);
Expand Down Expand Up @@ -1429,10 +1398,12 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
const char* const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
const int* const params = pinp + (4 <= nparams ? (nparams - 4) : 0);
char print_buffer[2048] = "";
int print_offset = 0;
size_t i;
LIBXSMM_STDIO_ACQUIRE();
if (0 != c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
max_kernel_dim, stream);
}
Expand All @@ -1458,20 +1429,21 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
# endif
if (tolerance < epsilon) {
if (0 == c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
max_kernel_dim, stream);
}
# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
# else
fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
if (LIBXSMM_NOTNAN(diff.v_tst)) {
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
" => ERROR diff=%g (|%g-%g|=%g)\n", epsilon, diff.v_ref, diff.v_tst, diff.linf_abs);
}
else
# endif
if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "stackposition = %llu (index=%llu)\n", (unsigned long long)i, (unsigned long long)ic);
opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
fprintf(stderr, "\n");
{
print_offset += LIBXSMM_SNPRINTF(
print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR diff=%g\n", epsilon);
}
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
Expand All @@ -1482,8 +1454,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
}
}
if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
fprintf(stderr, " => OK\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
}
LIBXSMM_STDIO_ACQUIRE();
fputs(print_buffer, stderr);
LIBXSMM_STDIO_RELEASE();
}
libxsmm_free(scratch);
Expand Down
Loading

0 comments on commit b12b740

Please sign in to comment.