Skip to content

Commit

Permalink
Add sortIndex to fix ordering of roctracer GPU id tracks (#926)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #926

Although hipGetDeviceProperties shows 8 devices enumerating from 0 to 7, when using roctracer_record_t, the record->device_id enumerates from 2 to 9. This is because roctracer considers id 0 and 1 as CPU sockets.

So we can expect up to 8 CPU sockets and 8 GPUs, up to a total count of 16. Which would mean the enumeration will be 0 - 15. This will help put the GPU tracks in order and at the bottom of the chrome trace.

Original Bug report in ROCm/roctracer: ROCm/roctracer#98. We can just show the device id that is given by roctracer.

Test Plan: CI, added a unit test: JsonGPUIDSortTest, and ran locally.

Reviewed By: chaekit

Differential Revision: D57067065

Pulled By: aaronenyeshi
  • Loading branch information
aaronenyeshi authored and facebook-github-bot committed Jun 7, 2024
1 parent 95e06b8 commit 20807ca
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 9 deletions.
13 changes: 10 additions & 3 deletions libkineto/include/IActivityProfiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,19 @@ enum class TraceStatus {
};

/* DeviceInfo:
* Can be used to specify process name, PID and device label
* Can be used to specify process name, sort order, PID and device label.
* The sort order is determined by the sortIndex field to handle ordering of
* processes and gpu rows in the trace viewer.
*/
struct DeviceInfo {
DeviceInfo(int64_t id, const std::string& name, const std::string& label)
: id(id), name(name), label(label) {}
DeviceInfo(
int64_t id,
int64_t sortIndex,
const std::string& name,
const std::string& label)
: id(id), sortIndex(sortIndex), name(name), label(label) {}
int64_t id; // process id
int64_t sortIndex; // position in trace view
const std::string name; // process name
const std::string label; // device label
};
Expand Down
5 changes: 5 additions & 0 deletions libkineto/include/output_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ namespace libkineto {

using namespace KINETO_NAMESPACE;

// Used by sortIndex to put GPU tracks at the bottom
// of the trace timelines. The largest valid CPU PID is 4,194,304,
// so 5000000 is enough to guarantee that GPU tracks are sorted after CPU.
constexpr int64_t kExceedMaxPid = 5000000;

class ActivityLogger {
public:

Expand Down
15 changes: 10 additions & 5 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <fmt/format.h>
#include <time.h>
#include <atomic>
#include <cstdint>
#include <functional>
#include <iomanip>
#include <optional>
Expand Down Expand Up @@ -1232,13 +1233,17 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger&
string process_name = processName(pid);
if (!process_name.empty()) {
logger.handleDeviceInfo(
{pid, process_name, "CPU"}, captureWindowStartTime_);
{pid, pid, process_name, "CPU"}, captureWindowStartTime_);
if (!cpuOnly_) {
// GPU events use device id as pid (0-7).
constexpr int kMaxGpuCount = 8;
for (int gpu = 0; gpu < kMaxGpuCount; gpu++) {
// Usually, GPU events use device id as pid (0-7).
// In some cases, CPU sockets are numbered starting from 0.
// In the worst case, 8 CPU sockets + 8 GPUs, so the max GPU ID is 15.
constexpr int kMaxGpuID = 15;
// sortIndex is gpu + kExceedMaxPid to put GPU tracks at the bottom
// of the trace timelines.
for (int gpu = 0; gpu <= kMaxGpuID; gpu++) {
logger.handleDeviceInfo(
{gpu, process_name, fmt::format("GPU {}", gpu)},
{gpu, gpu + kExceedMaxPid, process_name, fmt::format("GPU {}", gpu)},
captureWindowStartTime_);
}
}
Expand Down
2 changes: 1 addition & 1 deletion libkineto/src/output_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ void ChromeTraceLogger::handleDeviceInfo(
time/1000, time%1000, info.id,
info.label,
time/1000, time%1000, info.id,
info.id < 8 ? info.id + 0x1000000ll : info.id);
info.sortIndex);
// clang-format on
}

Expand Down
78 changes: 78 additions & 0 deletions libkineto/test/CuptiActivityProfilerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -997,3 +997,81 @@ TEST(CuptiActivityProfiler, MetadataJsonFormatingTest) {
EXPECT_EQ(1, countSubstrings(jsonStr, "/test/metadata/path"));
#endif
}

TEST_F(CuptiActivityProfilerTest, JsonGPUIDSortTest) {
// Set logging level for debugging purpose
std::vector<std::string> log_modules(
{"CuptiActivityProfiler.cpp", "output_json.cpp"});
SET_LOG_VERBOSITY_LEVEL(2, log_modules);

// Start and stop profiling
CuptiActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false);
int64_t start_time_ns = libkineto::timeSinceEpoch(std::chrono::system_clock::now());
int64_t duration_ns = 500;
auto start_time = time_point<system_clock>(nanoseconds(start_time_ns));
profiler.configure(*cfg_, start_time);
profiler.startTrace(start_time);
profiler.stopTrace(start_time + nanoseconds(duration_ns));
profiler.recordThreadInfo();

// Set up CPU events and corresponding GPU events
auto cpuOps = std::make_unique<MockCpuActivityBuffer>(
start_time_ns, start_time_ns + duration_ns);
cpuOps->addOp("op1", start_time_ns + 10, start_time_ns + 30, 1);
profiler.transferCpuTrace(std::move(cpuOps));
auto gpuOps = std::make_unique<MockCuptiActivityBuffer>();
gpuOps->addRuntimeActivity(CUDA_LAUNCH_KERNEL, start_time_ns + 23, start_time_ns + 28, 1);
gpuOps->addKernelActivity(start_time_ns + 50, start_time_ns + 70, 1);
cuptiActivities_.activityBuffer = std::move(gpuOps);

// Process trace
auto logger = std::make_unique<MemoryTraceLogger>(*cfg_);
profiler.processTrace(*logger);
profiler.setLogger(logger.get());

// Profiler can be reset at this point - logger owns the activities
profiler.reset();

// Check the content of GPU event and we should see extra
// collective fields get populated from CPU event.
ActivityTrace trace(std::move(logger), loggerFactory);
EXPECT_EQ(3, trace.activities()->size());

#ifdef __linux__
// Test saved output can be loaded as JSON
char filename[] = "/tmp/libkineto_testXXXXXX.json";
mkstemps(filename, 5);
LOG(INFO) << "Logging to tmp file: " << filename;
trace.save(filename);

// Check that the saved JSON file can be loaded and deserialized
std::ifstream file(filename);
if (!file.is_open()) {
throw std::runtime_error("Failed to open the trace JSON file.");
}
std::string jsonStr(
(std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
folly::dynamic jsonData = folly::parseJson(jsonStr);

std::unordered_map<int64_t, std::string> sortLabel;
std::unordered_map<int64_t, int64_t> sortIdx;
for (auto& event : jsonData["traceEvents"]) {
if (event["name"] == "process_labels" && event["tid"] == 0 && event["pid"].isInt()) {
sortLabel[event["pid"].asInt()] = event["args"]["labels"].asString();
}
if (event["name"] == "process_sort_index" && event["tid"] == 0 && event["pid"].isInt()) {
sortIdx[event["pid"].asInt()] = event["args"]["sort_index"].asInt();
}
}

// Expect there is 1 CUPTI Overhead, and 16 CPU + GPU sorts, total 17.
EXPECT_EQ(17, sortLabel.size());
for (int i = 0; i<16; i++) {
// Check there are 16 GPU sorts (0-15) with expected sort_index.
EXPECT_EQ("GPU " + std::to_string(i), sortLabel[i]);
// sortIndex is gpu + kExceedMaxPid to put GPU tracks at the bottom
// of the trace timelines.
EXPECT_EQ(i + kExceedMaxPid, sortIdx[i]);
}
#endif
}

0 comments on commit 20807ca

Please sign in to comment.