Scripts to build s3-benchrunner-cli (#14)

- new `build.py` script for CLI runner - pulls aws-cli and aws-crt-python from GitHub, installs them into a Python virtual environment - new `README.md` for CLI runner - Move some copy/pastey stuff to common scripts: - `fetch-git-repo.py`: I needed this logic a 3rd time, so now it's in a helper script - `install-tools-AL2023.py`: Decided to just have 1 script for the whole repo, instead of 1 per runner. - They were getting too copy/pastey, with a lot of overlap in the tools they need
awslabs · Oct 3, 2023 · 7a5142e · 7a5142e
1 parent e40fadf
commit 7a5142e
Show file tree

Hide file tree

Showing 10 changed files with 293 additions and 135 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,7 +1,8 @@
 name: CI
 on: [push]
 jobs:
-  CI:
+
+  Lint:
     runs-on: ubuntu-22.04 # latest at time of writing
     steps:
     - uses: actions/checkout@v4
@@ -32,8 +33,27 @@ jobs:
         python scripts/build-benchmarks.py
         git diff --exit-code
 
-    - name: Build s3-benchrunner-c
-      run: python runners/s3-benchrunner-c/scripts/build.py --build-dir ${{runner.temp}}/build/c
+  Build:
+    runs-on: ubuntu-22.04 # latest at time of writing
+    strategy:
+      matrix:
+        runner:
+          - s3-benchrunner-c
+          - s3-benchrunner-cli
+          - s3-benchrunner-crt-java
+    steps:
+    - uses: actions/checkout@v4
+
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.9' # default version on Amazon Linux 2023
+
+    - uses: actions/setup-java@v3
+      with:
+        distribution: 'corretto'
+        java-version: '17' # default version on Amazon Linux 2023
+
+    - run: python -m pip install -r scripts/requirements.txt
 
-    - name: Build s3-benchrunner-crt-java
-      run: python runners/s3-benchrunner-crt-java/scripts/build.py --build-dir ${{runner.temp}}/build/crt-java
+    - name: Build
+      run: python runners/${{ matrix.runner }}/scripts/build.py --build-dir ${{runner.temp}}/build
diff --git a/README.md b/README.md
@@ -4,11 +4,16 @@ This project is for benchmarking different S3 workloads using various languages
 
 ## Running Benchmarks
 
-### Minimum Requirements
-
-*   Python 3.9+ with pip
-
-If you use Amazon Linux 2023 (recommended), there are scripts to help install further tools.
+### Requirements
+*   To start:
+    *   Python 3.9+ with pip
+*   On Amazon Linux 2023, a script is provided to install further tools.
+    Otherwise, depending on the language you want to benchmark, you'll need:
+    *   CMake 3.22+
+    *   C99 / C++20 compiler (e.g. gcc, clang)
+    *   JDK17+ (e.g. corretto, openjdk)
+    *   Maven
+    *   Python C extension headers and libraries (e.g. python3-devel)
 
 To run **ALL** the benchmarks, your machine needs 300+ GiB of disk space available,
 and fast enough internet to upload a terabyte to S3 within your lifetime.
@@ -20,7 +25,13 @@ Your machine must have AWS credentials, with permission to read and write to an
 
 First, clone this repo.
 
-Then install packages needed by the python scripts:
+Then install the [requirements](#requirements) listed above.
+On Amazon Linux 2023, you can simply run this script:
+```sh
+./aws-crt-s3-benchmarks/scripts/install-tools-AL2023.py
+```
+
+Then, install packages needed by the python scripts:
 ```sh
 python3 -m pip install -r aws-crt-s3-benchmarks/scripts/requirements.txt
 ```
@@ -52,15 +63,7 @@ For example, [runners/s3-benchrunner-c](runners/s3-benchrunner-c/) tests the
 [aws-c-s3](https://github.com/awslabs/aws-c-s3/) library.
 See [runners/](runners/#readme) for more info.
 
-Every runner comes with 2 scripts, which you should run now.
-
-`install-tools.py` installs tools on Amazon Linux 2023
-(On another OS? Read the script to see what you need):
-```sh
-./aws-crt-s3-benchmarks/runners/RUNNER_X/scripts/install-tools.py
-```
-
-And `build.py` to build the runner:
+Every runner comes with a `build.py` script:
 ```sh
 ./aws-crt-s3-benchmarks/runners/RUNNER_X/scripts/build.py --build-dir BUILD_DIR
 ```

diff --git a/runners/s3-benchrunner-c/scripts/build.py b/runners/s3-benchrunner-c/scripts/build.py
@@ -3,6 +3,7 @@
 import os
 from pathlib import Path
 import subprocess
+import sys
 
 ARG_PARSER = argparse.ArgumentParser(
     description='Build runner and its dependencies',
@@ -30,14 +31,8 @@
 
 
 def run(cmd_args: list[str]):
-    if not try_run(cmd_args):
-        exit(f'FAILED: {subprocess.list2cmdline(cmd_args)}')
-
-
-def try_run(cmd_args: list[str]):
     print(f'> {subprocess.list2cmdline(cmd_args)}')
-    result = subprocess.run(cmd_args)
-    return result.returncode == 0
+    subprocess.run(cmd_args, check=True)
 
 
 def fetch_dep(work_dir: Path, dep_name: str, branch: str) -> Path:
@@ -47,22 +42,11 @@ def fetch_dep(work_dir: Path, dep_name: str, branch: str) -> Path:
     """
     dep_dir = work_dir.joinpath(dep_name)
 
-    # git clone (if necessary)
-    os.chdir(str(work_dir))
-    if not dep_dir.exists():
-        run(['git', 'clone', f'https://github.com/awslabs/{dep_name}'])
-
-    os.chdir(str(dep_dir))
-
-    # git fetch before checkout (in case repo was already there and new branch was not fetched)
-    run(['git', 'fetch'])
-
-    # git checkout branch, but if it doesn't exist use main
-    if not try_run(['git', 'checkout', branch]):
-        run(['git', 'checkout', 'main'])
-
-    # git pull (in case repo was already there without latest commits)
-    run(['git', 'pull'])
+    root = Path(__file__).parent.parent.parent.parent
+    run([sys.executable, str(root.joinpath('scripts/fetch-git-repo.py')),
+         '--repo', f'https://github.com/awslabs/{dep_name}.git',
+         '--preferred-branch', branch,
+         '--dir', str(dep_dir)])
 
     return dep_dir
 
@@ -106,6 +90,9 @@ def main(work_dir: Path, branch: str):
     work_dir = work_dir.resolve()  # normalize path
     work_dir.mkdir(parents=True, exist_ok=True)
 
+    # for faster C compilation
+    os.environ['CMAKE_BUILD_PARALLEL_LEVEL'] = str(os.cpu_count())
+
     # fetch and build dependencies
     for dep in DEPS:
         dep_src = fetch_dep(work_dir, dep, branch)

diff --git a/runners/s3-benchrunner-cli/README.md b/runners/s3-benchrunner-cli/README.md
@@ -0,0 +1,81 @@
+# s3-benchrunner-cli
+
+```
+usage: benchrunner.py [-h] [--verbose] [--use-existing-aws-config] BENCHMARK BUCKET REGION TARGET_THROUGHPUT
+
+Benchmark runner for AWS CLI
+
+positional arguments:
+  BENCHMARK
+  BUCKET
+  REGION
+  TARGET_THROUGHPUT
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --verbose             Show CLI commands and their output
+  --use-existing-aws-config
+                        If set, your existing AWS_CONFIG_FILE is used. (instead of one that customizes
+                        'preferred_transfer_client')
+```
+
+This runner uses your existing `aws` CLI installation.
+If you want to build the CLI yourself, see [instructions below](#building-locally).
+
+This runner skips benchmarks unless it can do them in a single AWS CLI command.
+If we used multiple commands, one after another, performance would look bad
+compared to other runners that run multiple commands in parallel.
+That's not a fair comparison (no one runs CLI commands in parallel) so we skip those benchmarks.
+
+Here are examples, showing how a given benchmark is run in a single CLI command:
+
+1) Uploading or downloading a single file is simple:
+    * benchmark: `upload-5GiB`
+    * cmd: `aws s3 cp upload/5GiB/1 s3://my-s3-benchmarks/upload/5GiB/1`
+
+2) A benchmark with multiple files only works if they're in the same directory:
+    * benchmark: `upload-5GiB-20x`
+    * cmd: `aws s3 cp upload/5GiB s3://my-s3-benchmarks/upload/5GiB --recursive`
+
+3) If the benchmark doesn't use every file in the directory, then we `--include` the ones we want:
+    * benchmark: `upload-5GiB-10x`
+    * cmd: `aws s3 cp upload/5GiB s3://my-s3-benchmarks/upload/5GiB --recursive --exclude "*" --include 1 --include 2 --include 3 --include 4 --include 5 --include 6 --include 7 --include 8 --include 9 --include 10`
+
+4) If the benchmark has `"filesOnDisk": false` then we upload from stdin, or download to stdout. This only works if the benchmark has 1 file.
+    * benchmark: `upload-5GiB-ram`
+    * cmd: `<5GiB_random_data> | aws s3 cp - s3://my-s3-benchmarks/upload/5GiB/1`
+
+## Building locally
+
+Here are instructions to use a locally built AWS CLI.
+
+First, create a virtual environment, to isolate your dev versions from system defaults:
+```sh
+python3 -m venv .venv
+```
+
+Now we'll use python in the virtual environment...
+Install some dependencies...
+```sh
+.venv/bin/python3 -m pip install --upgrade pip boto3
+```
+
+Next, pull the AWS CLI source code and install it in your virtual environment
+(`--editable` so we can modify its source without reinstalling):
+```sh
+git clone --branch v2 https://github.com/aws/aws-cli.git
+.venv/bin/python3 -m pip install --editable aws-cli
+```
+
+And if you want the latest aws-crt-python, pull it and install that too:
+```sh
+git clone --recurse-submodules https://github.com/awslabs/aws-crt-python.git
+.venv/bin/python3 -m pip install --editable aws-crt-python
+```
+pip complains that the newly installed aws-crt-python (version 1.0.0.dev0) clashes
+with the version requirements from awscli, but we ignore this.
+
+Now, you can execute the runner using your virtual environment with the latest CLI and CRT:
+```sh
+.venv/bin/python3 path/to/aws-crt-s3-benchmarks/runners/s3-benchrunner-cli/benchrunner.py --help
+```
diff --git a/runners/s3-benchrunner-cli/scripts/build.py b/runners/s3-benchrunner-cli/scripts/build.py
@@ -1,26 +1,94 @@
 #!/usr/bin/env python3
 import argparse
+import os
 from pathlib import Path
 import subprocess
+import sys
+from typing import Optional
 
-ARG_PARSER = argparse.ArgumentParser(
-    description='Build runner and its dependencies',
-    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-ARG_PARSER.add_argument(
+PARSER = argparse.ArgumentParser(
+    description='Build runner and its dependencies')
+PARSER.add_argument(
+    '--branch',
+    help='Git branch/commit/tag to use when pulling dependencies')
+PARSER.add_argument(
     '--build-dir', required=True,
     help='Root dir for build artifacts')
 
 
+def run(cmd_args: list[str]):
+    print(f'> {subprocess.list2cmdline(cmd_args)}')
+    subprocess.run(cmd_args, check=True)
+
+
+def fetch_git_repo(url: str, dir: Path, main_branch: str, preferred_branch: Optional[str]):
+    # use helper script
+    root = Path(__file__).parent.parent.parent.parent
+    fetch_cmd = [sys.executable, str(root.joinpath('scripts/fetch-git-repo.py')),
+                 '--repo', url,
+                 '--main-branch', main_branch,
+                 '--dir', str(dir)]
+    if preferred_branch:
+        fetch_cmd.extend(['--preferred-branch', preferred_branch])
+    run(fetch_cmd)
+
+
+def build_cli(work_dir: Path, branch: Optional[str], venv_python: str):
+    cli_dir = work_dir.joinpath('aws-cli')
+
+    # fetch git repo (if necessary)
+    fetch_git_repo('https://github.com/aws/aws-cli.git', cli_dir,
+                   main_branch='v2', preferred_branch=branch)
+
+    # install CLI into virtual env
+    # use --editable so we don't need to reinstall after simple file edits
+    run([venv_python, '-m', 'pip', 'install', '--editable', str(cli_dir)])
+
+
+def build_crt(work_dir: Path, branch: Optional[str], venv_python: str):
+    crt_dir = work_dir.joinpath('aws-crt-python')
+
+    # fetch git repo (if necessary)
+    fetch_git_repo('https://github.com/awslabs/aws-crt-python.git', crt_dir,
+                   main_branch='main', preferred_branch=branch)
+
+    # for faster C compilation
+    os.environ['CMAKE_BUILD_PARALLEL_LEVEL'] = str(os.cpu_count())
+
+    # install into virtual env
+    # use --editable so we don't need to reinstall after simple file edits
+    run([venv_python, '-m', 'pip', 'install', '--editable', str(crt_dir)])
+
+
 if __name__ == '__main__':
-    args = ARG_PARSER.parse_args()
+    args = PARSER.parse_args()
+    work_dir = Path(args.build_dir).resolve()
+
+    # create virtual environment (if necessary), so that awscli from GitHub
+    # doesn't interfere with the system installation of awscli
+    venv_dir = work_dir.joinpath('.venv')
+    venv_python = str(venv_dir.joinpath('bin/python3'))
+    if not venv_dir.exists():
+        run([sys.executable, '-m', 'venv', str(venv_dir)])
+
+        # upgrade pip to avoid warnings
+        run([venv_python, '-m', 'pip', 'install', '--upgrade', 'pip'])
+
+    # install aws-cli from Github
+    build_cli(work_dir, args.branch, venv_python)
+
+    # the runner uses boto3 too
+    run([venv_python, '-m', 'pip', 'install', 'boto3'])
+
+    # install aws-crt-python from Github
+    # (pip complains that the newly installed 1.0.0.dev0 clashes
+    # with the version requirements from awscli, but we ignore this)
+    build_crt(work_dir, args.branch, venv_python)
 
     runner_dir = Path(__file__).parent.parent.resolve()  # normalize path
     runner_py = str(runner_dir.joinpath('benchrunner.py'))
 
-    # TODO: install CLI from github
-    # for now, we'll just use what's in the package manager
-
-    # finally, print command for executing the runner
+    # finally, print command for executing the runner, using the virtual environment
     print("------ RUNNER_CMD ------")
-    runner_cmd = [runner_py]
+    runner_cmd = [venv_python, str(runner_dir.joinpath('benchrunner.py'))]
     print(subprocess.list2cmdline(runner_cmd))
diff --git a/runners/s3-benchrunner-cli/scripts/install-tools.py b/runners/s3-benchrunner-cli/scripts/install-tools.py