Add perfetto trace analysis benchmark (#969)

Summary: Add a benchmark for trace analysis tasks on backends like Perfetto.

Step 1: install the benchmark (this will download and decompress a sample trace from Amazon S3)

```
$ python -m benchmarks.perfetto.install
Checking out https://ossci-datasets.s3.amazonaws.com/torchbench/traces/torchbench_traces.tar.gz to /Users/xzhao9/git/kineto/benchmarks/trace_analysis/.data/torchbench_traces.tar.gz
decompressing input tarball: /Users/xzhao9/git/kineto/benchmarks/trace_analysis/.data/torchbench_traces.tar.gz...OK
Requirement already satisfied: perfetto in /Users/xzhao9/miniconda3/envs/test-numpy/lib/python3.11/site-packages (from -r /Users/xzhao9/git/kineto/benchmarks/trace_analysis/requirements.txt (line 1)) (0.7.0)
Requirement already satisfied: tabulate in /Users/xzhao9/miniconda3/envs/test-numpy/lib/python3.11/site-packages (from -r /Users/xzhao9/git/kineto/benchmarks/trace_analysis/requirements.txt (line 2)) (0.9.0)
Requirement already satisfied: protobuf in /Users/xzhao9/miniconda3/envs/test-numpy/lib/python3.11/site-packages (from perfetto->-r /Users/xzhao9/git/kineto/benchmarks/trace_analysis/requirements.txt (line 1)) (4.25.3)
```

Step 2: run the benchmark

```
$ python -m benchmarks.perfetto.run
input-task                                      perfetto-latency
----------------------------------------------  ------------------
torchbench_resnet50_3080ti-load                 8.53069
torchbench_resnet50_3080ti-search_gemm_kernels  0.067583
torchbench_resnet50_3080ti-select_kernels       0.000549563
torchbench_resnet50_3080ti-group_kernels        0.0145147
```

Right now, only the latency metric is available. We could add other metrics like memory footprint later.

Pull Request resolved: #969

Differential Revision: D60466932

Pulled By: xuzhao9
pytorch · Jul 30, 2024 · 18efd40 · 18efd40
1 parent 188c5f5
commit 18efd40
Show file tree

Hide file tree

Showing 12 changed files with 440 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 # ignore common items
 .idea
 .vscode
+.data
+__pycache__
diff --git a/benchmarks/perfetto/__init__.py b/benchmarks/perfetto/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+# Directory that contains this package's __init__.py.
+BENCHMARK_ROOT = os.path.dirname(__file__)
+# Benchmark inputs are stored under ".data", which .gitignore excludes from
+# version control.
+BENCHMARK_DATA_DIR = os.path.join(BENCHMARK_ROOT, ".data")
diff --git a/benchmarks/perfetto/backends/__init__.py b/benchmarks/perfetto/backends/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .clp import CLPTraceAnalysis
+from .common import DEFAULT_METRICS  # noqa: F401
+from .perfetto import PerfettoTraceAnalysis
+
+# Registry of trace analysis backend classes, keyed by backend name.
+AVAILABLE_BACKENDS = {
+    "perfetto": PerfettoTraceAnalysis,
+    "clp": CLPTraceAnalysis,
+}
+
+# Names of the tasks known to the benchmark.
+AVAILABLE_TASKS = [
+    "load",
+    "search_gemm_kernels",
+    "select_kernels",
+    "group_kernels",
+]
+
+# Stamp every backend class with its registry key so that instances can
+# report which backend they belong to through ``.name``.
+for name, analysis in AVAILABLE_BACKENDS.items():
+    analysis.name = name
diff --git a/benchmarks/perfetto/backends/clp.py b/benchmarks/perfetto/backends/clp.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from .common import TraceAnalysis
+
+
+# Placeholder backend: it defines nothing beyond what TraceAnalysis provides,
+# so loading a trace or requesting a task raises NotImplementedError until the
+# corresponding methods are added here.
+class CLPTraceAnalysis(TraceAnalysis):
+
+    def __init__(self, args: argparse.Namespace):
+        super().__init__(args)
diff --git a/benchmarks/perfetto/backends/common.py b/benchmarks/perfetto/backends/common.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+
+import time
+from dataclasses import dataclass, field, fields
+
+from typing import Callable, Dict
+
+import numpy
+
+from .. import BENCHMARK_DATA_DIR
+
+
+def _get_input_path(input_name):
+    """Return the path of the JSON trace called ``input_name`` in the data directory."""
+    trace_dir = os.path.join(BENCHMARK_DATA_DIR, "torchbench_traces")
+    return os.path.join(trace_dir, f"{input_name}.json")
+
+
+@dataclass
+class TraceAnalysisMetrics:
+    """Measurements collected by one backend; every field starts as an empty dict."""
+
+    # Latency to perform trace analysis tasks
+    latency: Dict[str, float] = field(default_factory=dict)
+    # Peak CPU memory to perform trace analysis tasks
+    peak_mem: Dict[str, float] = field(default_factory=dict)
+    # extra metrics
+    extra_metrics: Dict[str, float] = field(default_factory=dict)
+
+
+# Metrics collected when none are requested explicitly.
+DEFAULT_METRICS = ["latency"]
+# Every dedicated metric field, i.e. all fields except the free-form extras.
+BUILTIN_METRICS = {
+    metric.name
+    for metric in fields(TraceAnalysisMetrics)
+    if metric.name != "extra_metrics"
+}
+
+
+class TraceAnalysis:
+    """Base class for trace analysis backends.
+
+    A backend implements ``load`` plus one zero-argument method per supported
+    task name. The underscore-prefixed drivers (``_load`` and ``_run``) time
+    those calls and store the measurements in ``self.output.latency``.
+    """
+
+    output: "TraceAnalysisMetrics"
+
+    def __init__(self, args: argparse.Namespace):
+        """Create empty metrics and read ``warmup`` and ``iter`` from ``args``."""
+        self.output = TraceAnalysisMetrics()
+        self.warmup = args.warmup
+        self.iter = args.iter
+
+    def _load(self, input: str):
+        """Load the trace named ``input`` and record how long loading took.
+
+        The wall-clock duration of the single ``load`` call is stored under
+        the ``"load"`` key of ``self.output.latency``.
+        """
+        input_path = _get_input_path(input)
+        t_begin = time.perf_counter()
+        self.load(input_path)
+        t_end = time.perf_counter()
+        self.output.latency["load"] = t_end - t_begin
+
+    def _run(self, task: str):
+        """Benchmark ``task`` and record its median latency.
+
+        The task is called ``self.warmup`` times without timing, then
+        ``self.iter`` times with timing. The median of the timed calls is
+        stored under ``task`` in ``self.output.latency``.
+
+        Raises:
+            NotImplementedError: if this backend does not implement ``task``.
+        """
+        run_task = self.run(task)
+        # warmup
+        for _ in range(self.warmup):
+            run_task()
+        latencies = []
+        # TODO: does perfetto cache the query result?
+        for _ in range(self.iter):
+            t_begin = time.perf_counter()
+            run_task()
+            t_end = time.perf_counter()
+            latencies.append(t_end - t_begin)
+        # record p50 latency only
+        self.output.latency[task] = numpy.median(latencies)
+
+    def load(self, input_file_path: str):
+        """Load the trace at ``input_file_path``; backends must override this."""
+        raise NotImplementedError("Trace loading is not implemented yet.")
+
+    def run(self, task: str) -> Callable:
+        """Return the zero-argument callable that performs ``task``.
+
+        Raises:
+            NotImplementedError: if no callable attribute named ``task`` exists.
+        """
+        task_fn = getattr(self, task, None)
+        # The lookup is by attribute name, so a task name can collide with a
+        # plain data attribute such as "iter" or "output". Requiring a callable
+        # reports those as unsupported tasks instead of failing later with a
+        # TypeError when the benchmark loop tries to call them.
+        if not callable(task_fn):
+            raise NotImplementedError(f"Task {task} is not implemented yet.")
+        return task_fn
diff --git a/benchmarks/perfetto/backends/perfetto.py b/benchmarks/perfetto/backends/perfetto.py
@@ -0,0 +1,40 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from typing import List
+
+from perfetto.trace_processor import TraceProcessor
+
+from .common import TraceAnalysis
+
+
+class PerfettoTraceAnalysis(TraceAnalysis):
+    """Trace analysis backend that runs SQL queries through Perfetto's TraceProcessor."""
+
+    name = "perfetto"
+
+    def __init__(self, args: argparse.Namespace):
+        super().__init__(args)
+        # No trace is loaded until load() is called.
+        self.tp = None
+
+    def load(self, input_file_path: str):
+        """Open ``input_file_path`` in a new TraceProcessor, replacing any earlier one."""
+        # Close the previous processor before replacing it, so that loading
+        # several inputs in one run does not keep earlier processors (and the
+        # traces they loaded) alive.
+        # NOTE(review): this runs inside the timed load() call, so for every
+        # input after the first the reported load latency includes the close.
+        if self.tp is not None:
+            self.tp.close()
+            self.tp = None
+        self.tp = TraceProcessor(input_file_path)
+
+    def _query_rows(self, query: str) -> List[str]:
+        """Run ``query`` on the loaded trace and return each result row as a string."""
+        return [str(row) for row in self.tp.query(query)]
+
+    def search_gemm_kernels(self) -> List[str]:
+        """Return the distinct slice names matching the GEMM kernel name pattern."""
+        query = "SELECT DISTINCT(name) FROM slice WHERE name like '%sm90_xmma_gemm_%' ORDER BY ts"
+        return self._query_rows(query)
+
+    def select_kernels(self):
+        """Return ts, dur and name of the first 30 kernel slices ordered by timestamp."""
+        query = "SELECT ts, dur, name FROM slice WHERE category == 'kernel' ORDER BY ts limit 30"
+        return self._query_rows(query)
+
+    def group_kernels(self):
+        """Return total duration, mean duration and count per kernel name, most frequent first."""
+        query = "SELECT name, sum(dur), avg(dur), count(*) as occ FROM slice WHERE category == 'kernel' GROUP BY name ORDER BY occ DESC"
+        return self._query_rows(query)
diff --git a/benchmarks/perfetto/framework.py b/benchmarks/perfetto/framework.py
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+from .backends import AVAILABLE_BACKENDS, DEFAULT_METRICS
+from .table import TraceAnalysisBenchmarkResult
+
+
+class TraceAnalysisBenchmark:
+    """Runs the requested tasks on every requested input for each selected backend.
+
+    Results are collected in ``self.result.data``, keyed by
+    ``(input name, backend name)``.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        """Instantiate the selected backends and the result container from ``args``.
+
+        Raises:
+            AssertionError: if no inputs, tasks or known backends were selected.
+        """
+        self.inputs = args.inputs
+        self.tasks = args.tasks
+        # A list keeps the backends in registry order, so the run order is
+        # deterministic (a set of instances has no defined iteration order).
+        self.backends = [
+            backend_cls(args)
+            for backend_name, backend_cls in AVAILABLE_BACKENDS.items()
+            if backend_name in args.backends
+        ]
+
+        self.metrics = args.metrics or DEFAULT_METRICS
+
+        assert self.inputs, "Inputs to benchmark cannot be empty."
+        assert self.tasks, "Tasks to benchmark cannot be empty."
+        assert self.backends, "Backends to benchmark cannot be empty."
+
+        self.result = TraceAnalysisBenchmarkResult(
+            inputs=self.inputs,
+            tasks=self.tasks,
+            metrics=self.metrics,
+        )
+
+    def run(self):
+        """Load each input on each backend, run the tasks, and record the metrics."""
+        import copy
+
+        for backend in self.backends:
+            for input_name in self.inputs:
+                backend._load(input_name)
+                for task in self.tasks:
+                    # Loading is measured by _load() above, not as a query task.
+                    if task == "load":
+                        continue
+                    backend._run(task)
+                # Record one entry per input. The backend keeps writing into
+                # the same metrics object, so store a snapshot; otherwise every
+                # key would end up pointing at the last input's numbers.
+                result_key = (input_name, backend.name)
+                self.result.data[result_key] = copy.deepcopy(backend.output)
diff --git a/benchmarks/perfetto/install.py b/benchmarks/perfetto/install.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+import subprocess
+import sys
+
+from . import BENCHMARK_ROOT, s3_utils
+
+
+# Names of the trace tarballs fetched by download_traces_from_s3().
+TRACES = [
+    "torchbench_traces.tar.gz",
+]
+
+
+def download_traces_from_s3():
+    """Fetch every tarball listed in TRACES via s3_utils and decompress it."""
+    for tarball_name in TRACES:
+        s3_utils.checkout_s3_data(tarball_name, decompress=True)
+
+
+def install_deps(requirements_txt="requirements.txt"):
+    """Install the packages listed in ``requirements_txt`` with pip.
+
+    ``requirements_txt`` is resolved relative to the benchmark root directory.
+    A failing pip invocation raises ``subprocess.CalledProcessError``.
+    """
+    requirements_path = os.path.join(BENCHMARK_ROOT, requirements_txt)
+    # Run pip as a module of the current interpreter.
+    pip_command = [sys.executable, "-m", "pip", "install", "-r", requirements_path]
+    subprocess.check_call(pip_command)
+
+
+if __name__ == "__main__":
+    # Install the dependencies first: the download step imports `requests`,
+    # which is one of the packages listed in requirements.txt.
+    install_deps()
+    download_traces_from_s3()
diff --git a/benchmarks/perfetto/requirements.txt b/benchmarks/perfetto/requirements.txt
@@ -0,0 +1,4 @@
+perfetto
+tabulate
+numpy
+requests
diff --git a/benchmarks/perfetto/run.py b/benchmarks/perfetto/run.py
@@ -0,0 +1,56 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Kineto trace analysis benchmark.
+"""
+
+import argparse
+import sys
+
+from .backends import AVAILABLE_TASKS, DEFAULT_METRICS
+
+from .framework import TraceAnalysisBenchmark
+
+
+def _get_parser():
+    """Build the command-line parser for the trace analysis benchmark."""
+    cli = argparse.ArgumentParser()
+    # (flag, add_argument keyword arguments), registered in this order.
+    option_specs = [
+        (
+            "--inputs",
+            {
+                "nargs": "+",
+                "default": ["torchbench_resnet50_3080ti"],
+                "help": "Name of the inputs.",
+            },
+        ),
+        (
+            "--tasks",
+            {"nargs": "+", "default": AVAILABLE_TASKS, "help": "Name of the tasks."},
+        ),
+        (
+            "--backends",
+            {"nargs": "+", "default": ["perfetto"], "help": "Name of the backends."},
+        ),
+        (
+            "--metrics",
+            {"nargs": "+", "default": DEFAULT_METRICS, "help": "Metrics to collect."},
+        ),
+        ("--csv", {"action": "store_true", "help": "Output the result as csv"}),
+        (
+            "--warmup",
+            {"default": 10, "type": int, "help": "Number of warmup iterations."},
+        ),
+        ("--iter", {"default": 20, "type": int, "help": "Run iterations."}),
+    ]
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli
+
+
+if __name__ == "__main__":
+    # Parse the command line, run the benchmark, then report the result either
+    # as CSV or in the result object's own string form.
+    parser = _get_parser()
+    args = parser.parse_args()
+    benchmark = TraceAnalysisBenchmark(args)
+    benchmark.run()
+    result = benchmark.result
+
+    if args.csv:
+        # write_csv_to_file() is handed sys.stdout to write into. Only echo
+        # its return value when there is one, so that a method which writes
+        # directly to the stream does not add a stray "None" line.
+        csv_text = result.write_csv_to_file(sys.stdout)
+        if csv_text is not None:
+            print(csv_text)
+    else:
+        print(result)
diff --git a/benchmarks/perfetto/s3_utils.py b/benchmarks/perfetto/s3_utils.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from pathlib import Path
+
+from . import BENCHMARK_DATA_DIR
+
+
+def decompress_s3_data(s3_tarball_path: Path):
+    """Extract a ``.tar.gz`` archive into the benchmark data directory.
+
+    The archive is unpacked into a directory under ``BENCHMARK_DATA_DIR`` named
+    after the tarball without its ``.tar.gz`` suffix; the directory is created
+    if it does not exist.
+
+    Args:
+        s3_tarball_path: Path of the downloaded ``.tar.gz`` file.
+
+    Raises:
+        AssertionError: if the path does not end in ``.tar.gz``.
+    """
+    assert str(s3_tarball_path.absolute()).endswith(
+        ".tar.gz"
+    ), f"Expected .tar.gz file path but got {s3_tarball_path}."
+    import tarfile
+
+    # Hide decompressed file in .data directory so that they won't be checked in
+    decompress_dir = os.path.join(
+        BENCHMARK_DATA_DIR, s3_tarball_path.name.removesuffix(".tar.gz")
+    )
+
+    os.makedirs(decompress_dir, exist_ok=True)
+    print(f"Decompressing input tarball: {s3_tarball_path}...", end="", flush=True)
+    # The archive was downloaded, so use the "data" extraction filter when the
+    # running Python offers it; it refuses members that would be written
+    # outside the destination directory. Older interpreters lack the filter
+    # argument and keep the previous behaviour.
+    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
+    # The context manager closes the archive even if extraction fails.
+    with tarfile.open(s3_tarball_path) as tar:
+        tar.extractall(path=decompress_dir, **extract_kwargs)
+    print("OK")
+
+
+def checkout_s3_data(name: str, decompress: bool = True):
+    """Download ``name`` from the torchbench S3 bucket into the data directory.
+
+    Args:
+        name: File name under the bucket's ``traces/`` prefix; the download is
+            stored under the same name in ``BENCHMARK_DATA_DIR``.
+        decompress: When true, unpack the downloaded tarball afterwards.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    S3_URL_BASE = "https://ossci-datasets.s3.amazonaws.com/torchbench"
+    download_dir = Path(BENCHMARK_DATA_DIR)
+    download_dir.mkdir(parents=True, exist_ok=True)
+    import requests
+
+    full_path = download_dir.joinpath(name)
+    s3_url = f"{S3_URL_BASE}/traces/{name}"
+    print(f"Checking out {s3_url} to {full_path}")
+    # Stream the body to disk in chunks instead of holding the whole archive
+    # in memory, and give up if the server stops responding for a minute.
+    with requests.get(s3_url, allow_redirects=True, stream=True, timeout=60) as r:
+        # Fail here on an HTTP error rather than saving the error page as the
+        # tarball and failing later during decompression.
+        r.raise_for_status()
+        with open(str(full_path.absolute()), "wb") as output:
+            for chunk in r.iter_content(chunk_size=1 << 20):
+                output.write(chunk)
+    if decompress:
+        decompress_s3_data(full_path)