diff --git a/src/acc/libsmm_acc/libsmm_acc.cpp b/src/acc/libsmm_acc/libsmm_acc.cpp index b3981144586..8ad7ad05601 100644 --- a/src/acc/libsmm_acc/libsmm_acc.cpp +++ b/src/acc/libsmm_acc/libsmm_acc.cpp @@ -199,14 +199,14 @@ kernel_map_iterator add_kernel_handle_to_jitted_kernels( ACC_DRV(function) kern_func, ACC_DRV(stream) stream, Triplet h_mnk, int& threads, int& grouping, bool& generated_acc_untuned) { kernel_map_iterator kernel_it = kernel_handles.end(); - // Check if the kernel was already generated and failed + // Check if the kernel was already generated and failed or if it is too big if (failed_acc_kernels.find(h_mnk) != failed_acc_kernels.end()) return kernel_it; libsmm_acc_algo algo; int tile_m, tile_n, w, v, minblocks; // Check whether autotuned parameters are given for this kernel, and if so, retrieve them - if (ht.find(h_mnk) != ht.end()) { + if (ht.find(h_mnk) != ht.end() && false) { // Retrieve launching parameters const KernelParameters params = ht.at(h_mnk); algo = libsmm_acc_algo(params[0]); // enum {largeDB1, largeDB2, medium, small, tiny} @@ -219,7 +219,7 @@ kernel_map_iterator add_kernel_handle_to_jitted_kernels( minblocks = params[7]; generated_acc_untuned = false; } - else { // Use a default untuned kernel + else if (h_mnk[0] < 50 && h_mnk[1] < 50 && h_mnk[2] < 50) { // Use a default untuned kernel algo = medium; tile_m = 2; tile_n = 2; @@ -227,9 +227,13 @@ kernel_map_iterator add_kernel_handle_to_jitted_kernels( v = 0; threads = 256; grouping = 30; - minblocks = 2; + minblocks = 1; generated_acc_untuned = true; } + else { + failed_acc_kernels.insert(h_mnk); + return kernel_it; + } // JIT and validate the kernel jit_kernel(kern_func, algo, tile_m, tile_n, w, v, threads, grouping, minblocks, h_mnk[0], h_mnk[1], h_mnk[2]); diff --git a/src/core/dbcsr_config.F b/src/core/dbcsr_config.F index 9c57d6a1cfd..0655a1cc81a 100644 --- a/src/core/dbcsr_config.F +++ b/src/core/dbcsr_config.F @@ -701,7 +701,7 @@ SUBROUTINE reset_accdrv_active_device_id() accdrv_active_device_id = default_accdrv_active_device_id END SUBROUTINE reset_accdrv_active_device_id - FUNCTION use_acc() + PURE FUNCTION use_acc() LOGICAL :: use_acc IF (has_acc .AND. dbcsr_cfg%run_on_gpu%val) THEN diff --git a/src/mm/dbcsr_mm_sched.F b/src/mm/dbcsr_mm_sched.F index abe3dd86024..db68526e398 100644 --- a/src/mm/dbcsr_mm_sched.F +++ b/src/mm/dbcsr_mm_sched.F @@ -567,7 +567,8 @@ SUBROUTINE stats_print_report(report, output_unit) INTEGER(KIND=int_8), DIMENSION(3) :: flops_homo INTEGER, ALLOCATABLE, DIMENSION(:) :: sort_idx CHARACTER(LEN=4) :: generated_acc_untuned_label - LOGICAL :: has_acc_untuned_kernel + LOGICAL :: has_acc_untuned_kernel, & + use_cpu_kernels IF (output_unit <= 0) RETURN @@ -582,6 +583,8 @@ SUBROUTINE stats_print_report(report, output_unit) total_flops_homo = 0 flops_homo(:) = 0 has_acc_untuned_kernel = .FALSE. + use_cpu_kernels = .FALSE. + DO i = 1, SIZE(sort_idx) j = sort_idx(i) + 1 total = SUM(report%num_mnk_stacks(j, 4:6)) @@ -595,6 +598,10 @@ SUBROUTINE stats_print_report(report, output_unit) has_acc_untuned_kernel = .TRUE. END IF + IF (SUM(report%num_mnk_stacks(j, 4:5)) .GT. 0) THEN + use_cpu_kernels = .TRUE. + END IF + WRITE (output_unit, "(A,I5,' x ',I5,' x ',I5,T30,I20,5X,F5.1,'%',4X,F5.1,'%',4X,F5.1,'% ',A)") & " flops ", report%num_mnk_stacks(j, 1:3), & flops, & @@ -603,7 +610,13 @@ SUBROUTINE stats_print_report(report, output_unit) END DO IF (has_acc_untuned_kernel) THEN - DBCSR_WARN(" (*) ACC Untuned kernels, consider to run the tuning procedure") + CALL dbcsr_warn(__LOCATION__, & + " (*) ACC Untuned kernels, consider to run the ACC tuning procedure for them") + END IF + + IF (use_cpu_kernels .AND. use_acc()) THEN + CALL dbcsr_warn(__LOCATION__, & + " Some kernels are running on the CPU, consider to run the ACC tuning procedure for them") END IF total = report%cpu_flop + report%smm_flop + report%acc_flop