-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rust telemetry to JSON files, instead of OTLP/gRPC (#79)
Lots going on here, apologies... - Write telemetry to file, instead of exporting via OTLP/gRPC via [opentelemetry-otlp](https://crates.io/crates/opentelemetry-otlp/) crate. - Reasons for change: - Exporting via OTLP/gRPC was noticeably impacting performance. I tried to improve this by delaying any actual work until the benchmark run was complete (increasing buffer sizes, manually flushing buffers), but it still slowed max throughput from ~70 -> ~50 Gb/s. - It's just much simpler to spit out a file - Add our own telemetry exporter code, that writes [OpenTelemetry Protocol](https://opentelemetry.io/docs/specs/otel/protocol/) (OTLP) in JSON format. - We started with code from an old version of the `opentelemetry-stdout` crate. Specifically, we took the commit just before the [Simplify LogExporter::Export interface](open-telemetry/opentelemetry-rust@3193320) change. This change removed JSON export, so the code would be simpler for others to copy/paste. But we wanted JSON export, so copy/pasting from that point. - Modify the exporter so it doesn't do ANY work (buffering up all spans) until `flush_to_file(filename)` is called. - Hopefully, this eliminates the performance impact of gathering telemetry - This gives us a unique file per run. It's much nicer to analyze a run on its own, vs trying to isolate run 5 of 10 within an enormous file. - File is named like `trace_20241009T185957Z_download-30GiB-1x-ram_run01.json` - Add python scripts to visualize the telemetry data - Start with `allspans.py` which graphs ALL tracing spans, each in its own row, similar to tools like [Jaeger](https://github.com/jaegertracing/jaeger-ui) - Draw with [`plotly.express.timeline()`](https://plotly.com/python/gantt/). - `plotly` was recommended by ChatGPT, I asked for a python graphing library where you could hide information until you mouse-over it. I also specified that files should be easy to share, and `plotly` generates a single HTML file. 
- The figure could use more work, but it's a start - More visualizations coming...
- Loading branch information
Showing
14 changed files
with
1,151 additions
and
399 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/usr/bin/env python3
"""Render one benchmark run's tracing spans as a standalone HTML timeline."""
import argparse
import json
from pathlib import Path

from graph import PerfTimer
import graph.allspans

PARSER = argparse.ArgumentParser(description="Graph a benchmark run")

# The input file holds a JSON representation of OTLP TracesData.
# Its contents look like:
# {"resourceSpans":[
#   {"resource": {"attributes":[{"key":"service.name","value":{"stringValue":"s3-benchrunner-rust"}}, ...]},
#    "scopeSpans":[
#      {"scope":{"name":"s3-benchrunner-rust"},
#       "spans":[
#         {"traceId":"0e506aee98c24b869337620977f30cbb","spanId":"6fb4c16d1d1652d6", ...},
#         {"traceId":"0e506aee98c24b869337620977f30cbb","spanId":"6440f82fb6fc6299", ...},
#         ...
#
# The official protobuf schema is specified at:
# https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto
#
# Note: when proto data is mapped to JSON, snake_case names become camelCase.
# See: https://protobuf.dev/programming-guides/proto3/#json
PARSER.add_argument('TRACE_JSON', help="trace_*.json file to graph.")

args = PARSER.parse_args()

with PerfTimer(f'Open {args.TRACE_JSON}'), open(args.TRACE_JSON) as trace_file:
    traces_data = json.load(trace_file)

with PerfTimer('Graph all spans'):
    figure = graph.allspans.draw(traces_data)

# write the HTML next to the input file, e.g. trace_*.json -> trace_*.html
html_path = Path(args.TRACE_JSON).with_suffix('.html')
with PerfTimer(f'Write {html_path}'):
    figure.write_html(html_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import time | ||
|
||
|
||
class PerfTimer:
    """Context manager that prints how long a `with` statement took.

    Usage:
        with PerfTimer('Load data') as timer:
            ...

    After a successful exit, the measured duration in seconds is also
    available programmatically as `timer.elapsed`.
    """

    def __init__(self, name):
        # label printed alongside the measured duration
        self.name = name
        # seconds measured by the most recent successful `with` block,
        # or None if the block hasn't completed (or raised)
        self.elapsed = None

    def __enter__(self):
        self.start = time.perf_counter()
        # return self so callers can write `with PerfTimer(...) as t:`
        # (previously returned None, making the `as` target useless)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # only report timing when the block completed without raising,
        # so a failed run doesn't print a misleading duration
        if exc_type is None:
            self.elapsed = time.perf_counter() - self.start
            print(f"{self.name}: {self.elapsed:.3f} sec")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
from collections import defaultdict | ||
import pandas as pd # type: ignore | ||
import plotly.express as px # type: ignore | ||
|
||
|
||
def draw(data):
    """Build a plotly timeline figure showing every span in its own row."""
    # Flatten resourceSpans -> scopeSpans -> spans into one list.
    spans = [
        span
        for resource_span in data['resourceSpans']
        for scope_span in resource_span['scopeSpans']
        for span in scope_span['spans']
    ]

    # Replace each span's OTLP attribute list with a plain {key: value} dict.
    for span in spans:
        span['attributes'] = _simplify_attributes(span['attributes'])

    # Order spans depth-first according to the parent/child hierarchy.
    spans = _sort_spans_by_hierarchy(spans)

    # Build one list per DataFrame column.
    cols = defaultdict(list)
    occurrences = defaultdict(int)
    for row_idx, span in enumerate(spans):

        span_name = span['name']
        # Each span gets its own row, so derive a unique name to use as the Y value.
        # TODO: improve unique name, using "seq" or "part-num"
        occurrences[span_name] += 1
        row_label = f"{span_name}#{occurrences[span_name]}"

        cols['Name'].append(span_name)
        cols['Unique Name'].append(row_label)
        cols['Duration (ns)'].append(
            span['endTimeUnixNano'] - span['startTimeUnixNano'])
        cols['Start Time'].append(pd.to_datetime(span['startTimeUnixNano']))
        cols['End Time'].append(pd.to_datetime(span['endTimeUnixNano']))
        cols['Index'].append(row_idx)
        cols['Span ID'].append(span['spanId'])
        cols['Parent ID'].append(span['parentSpanId'])
        cols['Attributes'].append(
            [f"<br> {k}: {v}" for k, v in span['attributes'].items()])

    # Drop the "#1" suffix for span names that appear exactly once.
    for i, span_name in enumerate(cols['Name']):
        if occurrences[span_name] == 1:
            cols['Unique Name'][i] = span_name

    df = pd.DataFrame(cols)

    # Show every column in hover text, except the ones disabled below.
    # (Special formatting rules could also be set per-column here.)
    hover_data = dict.fromkeys(cols, True)
    hover_data['Name'] = False  # already shown
    hover_data['Unique Name'] = False  # already shown
    hover_data['End Time'] = False  # who cares

    fig = px.timeline(
        data_frame=df,
        x_start='Start Time',
        x_end='End Time',
        y='Unique Name',
        hover_data=hover_data,
        # Spans sharing an original name share a color.
        # TODO: combine name with code.namespace, in case same name used in multiple places
        color='Name',
        # Pin the row order; otherwise plotly groups rows by 'color'.
        category_orders={'Unique Name': df['Unique Name']},
    )

    # With many rows, compute an explicit figure height so rows stay legible.
    num_rows = len(spans)
    if num_rows > 20:
        preferred_total_height = 800
        min_row_height = 3
        row_height = int(max(min_row_height, preferred_total_height / num_rows))
        height = num_rows * row_height
        # Hide Y labels once rows are so squished that some would be omitted.
        show_yaxis_labels = row_height >= 15
    else:
        # Few rows: let plotly choose the height automatically.
        height = None
        show_yaxis_labels = True

    fig.update_layout(
        title="All Benchmark Spans",
        xaxis_title="Time",
        yaxis_title="Span Name",
        height=height,
        yaxis=dict(
            showticklabels=show_yaxis_labels,
        ),
        hovermode='y unified',  # show hover when mouse is anywhere in the row
    )

    return fig
|
||
|
||
def _sort_spans_by_hierarchy(spans): | ||
# map from ID to span | ||
id_to_span = {} | ||
# map from parent ID to to child span IDs | ||
parent_to_child_ids = defaultdict(list) | ||
for span in spans: | ||
id = span['spanId'] | ||
id_to_span[id] = span | ||
|
||
parent_id = span['parentSpanId'] | ||
parent_to_child_ids[parent_id].append(id) | ||
|
||
# sort spans in depth-first order, by crawling the parent/child tree starting at root | ||
sorted_spans = [] | ||
# ids_to_process is FIFO | ||
# With each loop, we pop the last item in ids_to_process | ||
# and then append its children, so that we process them next. | ||
ids_to_process = ['0000000000000000'] | ||
while ids_to_process: | ||
id = ids_to_process.pop(-1) | ||
if id in parent_to_child_ids: | ||
child_ids = parent_to_child_ids[id] | ||
# sorted by start time, but reversed because we pop from the BACK of ids_to_process | ||
child_ids = sorted( | ||
child_ids, key=lambda x: id_to_span[x]['startTimeUnixNano'], reverse=True) | ||
ids_to_process.extend(child_ids) | ||
|
||
if id in id_to_span: | ||
sorted_spans.append(id_to_span[id]) | ||
|
||
# warn if any spans are missing | ||
if (num_leftover := len(spans) - len(sorted_spans)): | ||
print(f"WARNING: {num_leftover} spans not shown (missing parents)") | ||
|
||
return sorted_spans | ||
|
||
|
||
# Transform attributes from like: | ||
# [ | ||
# {"key": "code.namespace", "value": {"stringValue": "s3_benchrunner_rust::transfer_manager"}}, | ||
# {"key": "code.lineno", "value": {"intValue": 136}} | ||
# ] | ||
# To like: | ||
# { | ||
# "code.namespace": "s3_benchrunner_rust::transfer_manager", | ||
# "code.lineno": 136, | ||
# } | ||
def _simplify_attributes(attributes_list): | ||
simple_dict = {} | ||
for attr in attributes_list: | ||
key = attr['key'] | ||
# extract actual value, ignoring value's key which looks like "intValue" | ||
value = next(iter(attr['value'].values())) | ||
|
||
# trim down long filepaths by omitting everything before "src/" | ||
if key == 'code.filepath': | ||
if (src_idx := value.find("src/")) > 0: | ||
value = value[src_idx:] | ||
|
||
simple_dict[key] = value | ||
|
||
return simple_dict |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.