Add --warmup-run option (unisa-hpc#14)

Add an option to perform an initial "warmup" run whose timings are excluded from the evaluation, so that one-time overheads such as JIT compilation do not skew the results. --------- Signed-off-by: Victor Perez <[email protected]>
sommerlukas · Nov 17, 2023 · fe9f58f · fe9f58f
1 parent ac3c42e
commit fe9f58f
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 2 deletions.
diff --git a/bin/run-suite b/bin/run-suite
@@ -112,6 +112,7 @@ def create_log_range(begin, end):
     --no-verification - disable verification entirely
     --no-ndrange-kernels - do not run kernels based on ndrange parallel for
     --hierarchical-kernels - run kernels using hierarchical parallelism
+    --warmup-run - run benchmarks once before evaluation to discard possible "warmup" times, e.g., JIT compilation
 '''
 output_file = "./sycl-bench.csv"
 discard = io.StringIO()
@@ -138,6 +139,7 @@ parser.add_argument("-t", "--timeout", metavar="SEC",
                     default=-1,
                     help="Time out (second)", type=int)
 parser.add_argument("--mlir-only", action='store_true', help="Run MLIR tests only")
+parser.add_argument("--warmup-run", action='store_true', help="Perform warmup run")
 
 parse_args = parser.parse_args()
 if parse_args.timeout <= 0:
@@ -357,6 +359,9 @@ if __name__ == '__main__':
               args.append('--size='+str(size))
               args.append('--local='+str(localsize))
 
+              if parse_args.warmup_run:
+                args.append('--warmup-run')
+
               retcode, elapsed_time = invoke_benchmark(benchmark_executable, args)
               if retcode == 0:
                 max_runtime = max(max_runtime, elapsed_time)

diff --git a/include/command_line.h b/include/command_line.h
@@ -166,6 +166,7 @@ struct BenchmarkArgs
   // can be used to query additional benchmark specific information from the command line
   CommandLine cli;
   std::shared_ptr<ResultConsumer> result_consumer;
+  bool warmup_run;
 };
 
 class CUDASelector : public cl::sycl::device_selector {
@@ -193,6 +194,11 @@ class BenchmarkCommandLine
     std::size_t num_runs = cli_parser.getOrDefault<std::size_t>("--num-runs", 5);
 
     std::string device_type = cli_parser.getOrDefault<std::string>("--device", "default");
+    bool warmup_run = cli_parser.isFlagSet("--warmup-run");
+    if (warmup_run) {
+      // Make drop of first run transparent to the user
+      ++num_runs;
+    }
     cl::sycl::queue q = getQueue(device_type);
 
     bool verification_enabled = true;
@@ -216,7 +222,8 @@ class BenchmarkCommandLine
                                              verification_begin,
                                              verification_range},
                          cli_parser,
-                         result_consumer};
+                         result_consumer,
+                         warmup_run};
   }
 
 private:

diff --git a/include/time_metrics.h b/include/time_metrics.h
@@ -75,7 +75,11 @@ class TimeMetricsProcessor {
     for(const auto& name : allTimings) {
       if(unavailableTimings.count(name) == 0) {
         std::vector<double> resultsSeconds;
-        std::transform(timingResults.at(name).begin(), timingResults.at(name).end(), std::back_inserter(resultsSeconds),
+        auto times_begin = timingResults.at(name).begin();
+        if (args.warmup_run) {
+          ++times_begin;
+        }
+        std::transform(times_begin, timingResults.at(name).end(), std::back_inserter(resultsSeconds),
             [](auto r) { return r.count() / 1.0e9; });
         std::sort(resultsSeconds.begin(), resultsSeconds.end());