diff --git a/libkineto/sample_programs/kineto_cupti_profiler.cpp b/libkineto/sample_programs/kineto_cupti_profiler.cpp index 79447f76f..6c1d65252 100644 --- a/libkineto/sample_programs/kineto_cupti_profiler.cpp +++ b/libkineto/sample_programs/kineto_cupti_profiler.cpp @@ -30,7 +30,7 @@ int main() { libkineto::ActivityType::CUDA_PROFILER_RANGE, }; - libkineto_init(false, true); + //libkineto_init(false, true); libkineto::api().initProfilerIfRegistered(); // Use a special kineto__cuda_core_flop metric that counts individual diff --git a/libkineto/sample_programs/kineto_playground.cpp b/libkineto/sample_programs/kineto_playground.cpp index 63cd98f81..b8feb24e4 100644 --- a/libkineto/sample_programs/kineto_playground.cpp +++ b/libkineto/sample_programs/kineto_playground.cpp @@ -21,23 +21,27 @@ using namespace kineto; static const std::string kFileName = "/tmp/kineto_playground_trace.json"; int main() { + std::cout << "Warm up " << std::endl; warmup(); // Kineto config // Empty types set defaults to all types std::set types; - libkineto_init(false, true); - libkineto::api().initProfilerIfRegistered(); auto& profiler = libkineto::api().activityProfiler(); + libkineto::api().initProfilerIfRegistered(); profiler.prepareTrace(types); // Good to warm up after prepareTrace to get cupti initialization to settle + std::cout << "Warm up " << std::endl; warmup(); + std::cout << "Start trace" << std::endl; profiler.startTrace(); + std::cout << "Start playground" << std::endl; playground(); + std::cout << "Stop Trace" << std::endl; auto trace = profiler.stopTrace(); std::cout << "Stopped and processed trace. Got " << trace->activities()->size() << " activities."; trace->save(kFileName); diff --git a/libkineto/sample_programs/kineto_playground.cu b/libkineto/sample_programs/kineto_playground.cu index cc3550186..e6a693eaa 100644 --- a/libkineto/sample_programs/kineto_playground.cu +++ b/libkineto/sample_programs/kineto_playground.cu @@ -76,6 +76,9 @@ __global__ void square(float* A, int N) { void playground(void) { // Add your experimental CUDA implementation here. + basicMemcpyFromDevice(); + compute(); + basicMemcpyFromDevice(); } void compute(void) { diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index a8cb4aa5d..91984b962 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -903,16 +903,21 @@ void CuptiActivityProfiler::configureChildProfilers() { derivedConfig_->profileStartTime().time_since_epoch()) .count(); for (auto& profiler : profilers_) { - LOG(INFO) << "Evaluating whether to run child profiler = " << profiler->name(); + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Evaluating whether to run child profiler." ; auto session = profiler->configure( start_time_ms, derivedConfig_->profileDuration().count(), derivedConfig_->profileActivityTypes(), *config_); if (session) { - LOG(INFO) << "Running child profiler " << profiler->name() << " for " + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Running child profiler " << profiler->name() << " for " << derivedConfig_->profileDuration().count() << " ms"; sessions_.push_back(std::move(session)); + } else { + LOG(INFO) << "[Profiler = " << profiler->name() << "] " + << "Not running child profiler."; } } } diff --git a/libkineto/src/CuptiCallbackApi.cpp b/libkineto/src/CuptiCallbackApi.cpp index 2eefc80f9..b6d4fc69f 100644 --- a/libkineto/src/CuptiCallbackApi.cpp +++ b/libkineto/src/CuptiCallbackApi.cpp @@ -75,7 +75,7 @@ void CuptiCallbackApi::__callback_switchboard( CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* cbInfo) { - VLOG(0) << "Callback: domain = " << domain << ", cbid = " << cbid; + LOG(INFO) << "Callback: domain = " << domain << ", cbid = " << cbid; CallbackList *cblist = nullptr; switch (domain) { @@ -87,6 +87,12 @@ void CuptiCallbackApi::__callback_switchboard( cblist = &callbacks_.runtime[ CUDA_LAUNCH_KERNEL - __RUNTIME_CB_DOMAIN_START]; break; +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) + case CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060: + cblist = &callbacks_.runtime[ + CUDA_LAUNCH_KERNEL_EXC - __RUNTIME_CB_DOMAIN_START]; + break; +#endif default: break; } @@ -168,9 +174,12 @@ void CuptiCallbackApi::initCallbackApi() { cuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)callback_switchboard, nullptr)); - if (lastCuptiStatus_ != CUPTI_SUCCESS) { - VLOG(1) << "Failed cuptiSubscribe, status: " << lastCuptiStatus_; - } + + // TODO: Remove temporarily to work around static initialization order issue + // betweent this and GLOG. + // if (lastCuptiStatus_ != CUPTI_SUCCESS) { + // LOG(INFO) << "Failed cuptiSubscribe, status: " << lastCuptiStatus_; + // } initSuccess_ = (lastCuptiStatus_ == CUPTI_SUCCESS); #endif diff --git a/libkineto/src/CuptiCallbackApi.h b/libkineto/src/CuptiCallbackApi.h index 8cd53c471..ba7503c76 100644 --- a/libkineto/src/CuptiCallbackApi.h +++ b/libkineto/src/CuptiCallbackApi.h @@ -28,6 +28,7 @@ namespace KINETO_NAMESPACE { using namespace libkineto; + /* CuptiCallbackApi : Provides an abstraction over CUPTI callback * interface. This enables various callback functions to be registered * with this class. The class registers a global callback handler that @@ -54,6 +55,7 @@ class CuptiCallbackApi { // can possibly support more callback ids per domain // __RUNTIME_CB_DOMAIN_START = CUDA_LAUNCH_KERNEL, + CUDA_LAUNCH_KERNEL_EXC, // Used in H100 // Callbacks under Resource CB domain RESOURCE_CONTEXT_CREATED, diff --git a/libkineto/src/CuptiCallbackApiMock.h b/libkineto/src/CuptiCallbackApiMock.h index ac4a3577b..1457624e5 100644 --- a/libkineto/src/CuptiCallbackApiMock.h +++ b/libkineto/src/CuptiCallbackApiMock.h @@ -17,6 +17,10 @@ enum CUpti_CallbackDomain { }; enum CUpti_CallbackId { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, + +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, +#endif CUPTI_CBID_RESOURCE_CONTEXT_CREATED, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, }; diff --git a/libkineto/src/CuptiRangeProfilerApi.cpp b/libkineto/src/CuptiRangeProfilerApi.cpp index 3d4eecfcf..a7bf3817d 100644 --- a/libkineto/src/CuptiRangeProfilerApi.cpp +++ b/libkineto/src/CuptiRangeProfilerApi.cpp @@ -8,6 +8,7 @@ #include #include +#include "ILoggerObserver.h" #ifdef HAS_CUPTI #include #include @@ -15,10 +16,10 @@ #include #include -#include "time_since_epoch.h" -#include "Logger.h" #include "Demangle.h" #include "DeviceUtil.h" +#include "Logger.h" +#include "time_since_epoch.h" // TODO(T90238193) // @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude @@ -35,8 +36,7 @@ TraceSpan CuptiRBProfilerSession::getProfilerTraceSpan() { return TraceSpan( timeSinceEpoch(profilerStartTs_), timeSinceEpoch(profilerStopTs_), - "__cupti_profiler__" - ); + "__cupti_profiler__"); } #if HAS_CUPTI_RANGE_PROFILER @@ -53,7 +53,7 @@ std::unordered_map disable_flag; std::mutex contextMutex_; std::unordered_map ctx_to_dev; std::set active_devices; -} +} // namespace // forward declarations void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid); @@ -106,7 +106,8 @@ inline uint32_t getDevID(CUcontext ctx) { // 1. Track cuda contexts and maintain a list of active GPUs to profile // 2. Callbacks on kernel launches to track the name of automatic // ranges that correspond to names of kernels -// 3. Lastly CUPTI range profiler has to be enabled on the same thread executing +// 3. Lastly CUPTI range profiler has to be enabled on the same thread +// executing // the CUDA kernels. We use Callbacks to enable the profiler // asynchronously from another thread. @@ -116,7 +117,7 @@ void trackCudaCtx( CUpti_CallbackDomain /*domain*/, CUpti_CallbackId cbid, const CUpti_CallbackData* cbInfo) { - auto *d = reinterpret_cast(cbInfo); + auto* d = reinterpret_cast(cbInfo); auto ctx = d->context; uint32_t device_id = getDevID(ctx); @@ -130,14 +131,14 @@ void trackCudaCtx( void __trackCudaCtx(CUcontext ctx, uint32_t device_id, CUpti_CallbackId cbid) { std::lock_guard g(contextMutex_); if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_CREATED) { - VLOG(0) << "CUPTI Profiler observed CUDA Context created = " - << ctx << " device id = " << device_id; + VLOG(0) << "CUPTI Profiler observed CUDA Context created = " << ctx + << " device id = " << device_id; active_devices.insert(device_id); ctx_to_dev[ctx] = device_id; } else if (cbid == CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING) { - VLOG(0) << "CUPTI Profiler observed CUDA Context destroyed = " - << ctx << " device id = " << device_id; + VLOG(0) << "CUPTI Profiler observed CUDA Context destroyed = " << ctx + << " device id = " << device_id; auto it = active_devices.find(device_id); if (it != active_devices.end()) { active_devices.erase(it); @@ -151,7 +152,7 @@ void trackCudaKernelLaunch( CUpti_CallbackId /*cbid*/, const CUpti_CallbackData* cbInfo) { VLOG(1) << " Trace : Callback name = " - << (cbInfo->symbolName ? cbInfo->symbolName: "") + << (cbInfo->symbolName ? cbInfo->symbolName : "") << " context ptr = " << cbInfo->context; auto ctx = cbInfo->context; // should be in CUPTI_API_ENTER call site @@ -161,9 +162,7 @@ void trackCudaKernelLaunch( __trackCudaKernelLaunch(ctx, cbInfo->symbolName); } -void __trackCudaKernelLaunch( - CUcontext ctx, - const char* kernelName) { +void __trackCudaKernelLaunch(CUcontext ctx, const char* kernelName) { VLOG(0) << " Tracking kernel name = " << (kernelName ? kernelName : "") << " context ptr = " << ctx; @@ -214,27 +213,43 @@ void __trackCudaKernelLaunch( void enableKernelCallbacks() { auto cbapi = CuptiCallbackApi::singleton(); + bool status = cbapi->enableCallback( CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); +// cudaLaunchKernelExC() used from H100 onwards. +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) + status &= cbapi->enableCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); +#endif + if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to " - << "enable cuda kernel launch callback"; - return; + << "enable cuda kernel launch callback."; } + LOG(INFO) << "CUPTI Profiler kernel callbacks enabled"; } void disableKernelCallbacks() { auto cbapi = CuptiCallbackApi::singleton(); + bool status = cbapi->disableCallback( CUPTI_CB_DOMAIN_RUNTIME_API, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); +#if defined(CUDA_VERSION) && (CUDA_VERSION >= 11080) + status &= cbapi->disableCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); +#endif + if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to " - << "disable cuda kernel launch callback"; + << "disable cuda kernel launch callback."; return; } + LOG(INFO) << "CUPTI Profiler kernel callbacks disabled"; } @@ -251,8 +266,8 @@ bool CuptiRBProfilerSession::initCupti() { // when you plan to use CUPTI range profiler CUpti_Profiler_Initialize_Params profilerInitializeParams = { CUpti_Profiler_Initialize_Params_STRUCT_SIZE, nullptr}; - CUptiResult status = CUPTI_CALL_NOWARN( - cuptiProfilerInitialize(&profilerInitializeParams)); + CUptiResult status = + CUPTI_CALL_NOWARN(cuptiProfilerInitialize(&profilerInitializeParams)); return (status == CUPTI_SUCCESS); } @@ -270,12 +285,14 @@ bool CuptiRBProfilerSession::staticInit() { CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE; bool status = cbapi->registerCallback( domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, trackCudaCtx); - status = status && cbapi->registerCallback( - domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, trackCudaCtx); - status = status && cbapi->enableCallback( - domain, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); - status = status && cbapi->enableCallback( - domain, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); + status = status && + cbapi->registerCallback( + domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, trackCudaCtx); + status = status && + cbapi->enableCallback(domain, CUPTI_CBID_RESOURCE_CONTEXT_CREATED); + status = status && + cbapi->enableCallback( + domain, CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING); if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda context " @@ -287,6 +304,11 @@ bool CuptiRBProfilerSession::staticInit() { domain = CUPTI_CB_DOMAIN_RUNTIME_API; status = cbapi->registerCallback( domain, CuptiCallbackApi::CUDA_LAUNCH_KERNEL, trackCudaKernelLaunch); + status = status && + cbapi->registerCallback( + domain, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL_EXC, + trackCudaKernelLaunch); if (!status) { LOG(WARNING) << "CUPTI Range Profiler unable to attach cuda kernel " @@ -303,7 +325,6 @@ std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { return counterAvailabilityImage_; } - // Setup the profiler sessions CuptiRBProfilerSession::CuptiRBProfilerSession( const CuptiRangeProfilerOptions& opts) @@ -326,8 +347,8 @@ CuptiRBProfilerSession::CuptiRBProfilerSession( return; } - LOG(INFO) << "Initializing CUPTI range profiler session : device = " << deviceId_ - << " chip = " << chipName_; + LOG(INFO) << "Initializing CUPTI range profiler session : device = " + << deviceId_ << " chip = " << chipName_; /* Generate configuration for metrics, this can also be done offline*/ NVPW_InitializeHost_Params initializeHostParams = { NVPW_InitializeHost_Params_STRUCT_SIZE, nullptr}; @@ -343,9 +364,7 @@ CuptiRBProfilerSession::CuptiRBProfilerSession( return; } if (!nvperf::getCounterDataPrefixImage( - chipName_, - metricNames_, - counterDataImagePrefix)) { + chipName_, metricNames_, counterDataImagePrefix)) { LOG(ERROR) << "Failed to create counterDataImagePrefix"; return; } @@ -360,13 +379,13 @@ CuptiRBProfilerSession::CuptiRBProfilerSession( } LOG(INFO) << "Size of structs" - << " config image size = " << configImage.size() << " B" - << " counter data image prefix = " - << counterDataImagePrefix.size() << " B" + << " config image size = " << configImage.size() << " B" + << " counter data image prefix = " << counterDataImagePrefix.size() + << " B" << " counter data image size = " << counterDataImage.size() / 1024 << " KB" - << " counter sb image size = " - << counterDataScratchBuffer.size() << " B"; + << " counter sb image size = " << counterDataScratchBuffer.size() + << " B"; beginPassParams_ = {CUpti_Profiler_BeginPass_Params_STRUCT_SIZE, nullptr, {}}; beginPassParams_.ctx = cuContext_; @@ -411,7 +430,8 @@ void CuptiRBProfilerSession::startInternal( beginSessionParams.pCounterDataImage = counterDataImage.data(); beginSessionParams.counterDataScratchBufferSize = counterDataScratchBuffer.size(); - beginSessionParams.pCounterDataScratchBuffer = counterDataScratchBuffer.data(); + beginSessionParams.pCounterDataScratchBuffer = + counterDataScratchBuffer.data(); beginSessionParams.range = profilerRange; beginSessionParams.replayMode = profilerReplayMode; beginSessionParams.maxRangesPerPass = maxRanges_; @@ -422,6 +442,8 @@ void CuptiRBProfilerSession::startInternal( LOG(WARNING) << "Failed to start CUPTI range profiler"; initSuccess_ = false; return; + } else { + LOG(INFO) << "Successfully started CUPTI range profiler"; } // Set counter configuration @@ -584,9 +606,7 @@ void CuptiRBProfilerSession::asyncDisableAndStop() { disable_flag[deviceId_] = true; } - -CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics( - bool verbose) { +CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics(bool verbose) { if (!initSuccess_) { LOG(WARNING) << "Profiling failed, no results to return"; return {}; @@ -635,54 +655,57 @@ bool CuptiRBProfilerSession::createCounterDataImage() { // Calculate size of counter data image CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = { - CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE, nullptr}; + CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE, + nullptr}; calculateSizeParams.pOptions = &counterDataImageOptions; calculateSizeParams.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; - CUPTI_CALL( - cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); + CUPTI_CALL(cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams)); counterDataImage.resize(calculateSizeParams.counterDataImageSize); // Initialize counter data image CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = { - CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE, nullptr}; + CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE, nullptr}; initializeParams.sizeofCounterDataImageOptions = - CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; + CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE; initializeParams.pOptions = &counterDataImageOptions; initializeParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; + calculateSizeParams.counterDataImageSize; initializeParams.pCounterDataImage = counterDataImage.data(); CUPTI_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams)); // Calculate counter Scratch Buffer size CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params - scratchBufferSizeParams = { - CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE, nullptr}; + scratchBufferSizeParams = { + CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE, + nullptr}; scratchBufferSizeParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; + calculateSizeParams.counterDataImageSize; scratchBufferSizeParams.pCounterDataImage = - initializeParams.pCounterDataImage; + initializeParams.pCounterDataImage; CUPTI_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize( - &scratchBufferSizeParams)); + &scratchBufferSizeParams)); counterDataScratchBuffer.resize( scratchBufferSizeParams.counterDataScratchBufferSize); // Initialize scratch buffer CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params - initScratchBufferParams = { - CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE, nullptr}; + initScratchBufferParams = { + CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE, + nullptr}; initScratchBufferParams.counterDataImageSize = - calculateSizeParams.counterDataImageSize; + calculateSizeParams.counterDataImageSize; - initScratchBufferParams.pCounterDataImage = initializeParams.pCounterDataImage; + initScratchBufferParams.pCounterDataImage = + initializeParams.pCounterDataImage; initScratchBufferParams.counterDataScratchBufferSize = - scratchBufferSizeParams.counterDataScratchBufferSize; + scratchBufferSizeParams.counterDataScratchBufferSize; initScratchBufferParams.pCounterDataScratchBuffer = - counterDataScratchBuffer.data(); + counterDataScratchBuffer.data(); CUPTI_CALL(cuptiProfilerCounterDataImageInitializeScratchBuffer( &initScratchBufferParams)); @@ -705,13 +728,15 @@ CuptiRBProfilerSession::CuptiRBProfilerSession( deviceId_(opts.deviceId), maxRanges_(opts.maxRanges), numNestingLevels_(opts.numNestingLevels), - cuContext_(opts.cuContext) {}; + cuContext_(opts.cuContext){}; CuptiRBProfilerSession::~CuptiRBProfilerSession() {} void CuptiRBProfilerSession::stop() {} void CuptiRBProfilerSession::enable() {} void CuptiRBProfilerSession::disable() {} void CuptiRBProfilerSession::beginPass() {} -bool CuptiRBProfilerSession::endPass() { return true; } +bool CuptiRBProfilerSession::endPass() { + return true; +} void CuptiRBProfilerSession::flushCounterData() {} void CuptiRBProfilerSession::pushRange(const std::string& /*rangeName*/) {} void CuptiRBProfilerSession::popRange() {} @@ -728,11 +753,19 @@ CuptiProfilerResult CuptiRBProfilerSession::evaluateMetrics(bool verbose) { void CuptiRBProfilerSession::saveCounterData( const std::string& /*CounterDataFileName*/, const std::string& /*CounterDataSBFileName*/) {} -bool CuptiRBProfilerSession::initCupti() { return false; } +bool CuptiRBProfilerSession::initCupti() { + return false; +} void CuptiRBProfilerSession::deInitCupti() {} -bool CuptiRBProfilerSession::staticInit() { return false; } -std::set CuptiRBProfilerSession::getActiveDevices() { return {}; } -bool CuptiRBProfilerSession::createCounterDataImage() { return true; } +bool CuptiRBProfilerSession::staticInit() { + return false; +} +std::set CuptiRBProfilerSession::getActiveDevices() { + return {}; +} +bool CuptiRBProfilerSession::createCounterDataImage() { + return true; +} void CuptiRBProfilerSession::startInternal( CUpti_ProfilerRange /*profilerRange*/, CUpti_ProfilerReplayMode /*profilerReplayMode*/) {} @@ -742,8 +775,8 @@ std::vector& CuptiRBProfilerSession::counterAvailabilityImage() { } #endif // HAS_CUPTI_RANGE_PROFILER -std::unique_ptr -CuptiRBProfilerSessionFactory::make(const CuptiRangeProfilerOptions& opts) { +std::unique_ptr CuptiRBProfilerSessionFactory::make( + const CuptiRangeProfilerOptions& opts) { return std::make_unique(opts); } diff --git a/libkineto/test/CuptiActivityProfilerTest.cpp b/libkineto/test/CuptiActivityProfilerTest.cpp index 1811a57a1..20e50f77d 100644 --- a/libkineto/test/CuptiActivityProfilerTest.cpp +++ b/libkineto/test/CuptiActivityProfilerTest.cpp @@ -7,26 +7,26 @@ */ #include +#include #include #include #include #include #include -#include #ifdef __linux__ +#include #include #include -#include #endif -#include "include/libkineto.h" #include "include/Config.h" #include "include/output_base.h" #include "include/time_since_epoch.h" -#include "src/CuptiActivityProfiler.h" +#include "include/libkineto.h" #include "src/ActivityTrace.h" #include "src/CuptiActivityApi.h" +#include "src/CuptiActivityProfiler.h" #include "src/output_json.h" #include "src/output_membuf.h" @@ -62,12 +62,12 @@ const TraceSpan& defaultTraceSpan() { static TraceSpan span(0, 0, "Unknown", ""); return span; } -} +} // namespace // Provides ability to easily create a few test CPU-side ops struct MockCpuActivityBuffer : public CpuTraceBuffer { MockCpuActivityBuffer(int64_t startTime, int64_t endTime) { - span = TraceSpan(startTime, endTime,"Test trace"); + span = TraceSpan(startTime, endTime, "Test trace"); gpuOpCount = 0; } @@ -95,8 +95,12 @@ struct MockCpuActivityBuffer : public CpuTraceBuffer { // Provides ability to easily create a few test CUPTI ops struct MockCuptiActivityBuffer { - void addCorrelationActivity(int64_t correlation, CUpti_ExternalCorrelationKind externalKind, int64_t externalId) { - auto& act = *(CUpti_ActivityExternalCorrelation*) malloc(sizeof(CUpti_ActivityExternalCorrelation)); + void addCorrelationActivity( + int64_t correlation, + CUpti_ExternalCorrelationKind externalKind, + int64_t externalId) { + auto& act = *(CUpti_ActivityExternalCorrelation*)malloc( + sizeof(CUpti_ActivityExternalCorrelation)); act.kind = CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION; act.externalId = externalId; act.externalKind = externalKind; @@ -183,9 +187,8 @@ struct MockCuptiActivityBuffer { activities.push_back(reinterpret_cast(&act)); } - template - T& createActivity( - int64_t start_ns, int64_t end_ns, int64_t correlation) { + template + T& createActivity(int64_t start_ns, int64_t end_ns, int64_t correlation) { T& act = *static_cast(malloc(sizeof(T))); bzero(&act, sizeof(act)); act.start = start_ns; @@ -215,8 +218,7 @@ class MockCuptiActivities : public CuptiActivityApi { return {activityBuffer->activities.size(), 100}; } - virtual std::unique_ptr - activityBuffers() override { + virtual std::unique_ptr activityBuffers() override { auto map = std::make_unique(); auto buf = std::make_unique(100); uint8_t* addr = buf->data(); @@ -224,14 +226,16 @@ class MockCuptiActivities : public CuptiActivityApi { return map; } - void bufferRequestedOverride(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + void bufferRequestedOverride( + uint8_t** buffer, + size_t* size, + size_t* maxNumRecords) { this->bufferRequested(buffer, size, maxNumRecords); } std::unique_ptr activityBuffer; }; - // Common setup / teardown and helper functions class CuptiActivityProfilerTest : public ::testing::Test { protected: @@ -241,7 +245,7 @@ class CuptiActivityProfilerTest : public ::testing::Test { cfg_ = std::make_unique(); cfg_->validate(std::chrono::system_clock::now()); loggerFactory.addProtocol("file", [](const std::string& url) { - return std::unique_ptr(new ChromeTraceLogger(url)); + return std::unique_ptr(new ChromeTraceLogger(url)); }); } @@ -260,7 +264,7 @@ void checkTracefile(const char* filename) { } EXPECT_TRUE(fd); // Should expect at least 100 bytes - struct stat buf{}; + struct stat buf {}; fstat(fd, &buf); EXPECT_GT(buf.st_size, 100); close(fd); @@ -285,20 +289,24 @@ TEST(CuptiActivityProfiler, AsyncTrace) { auto now = system_clock::now(); auto startTime = now + seconds(10); - bool success = cfg.parse(fmt::format(R"CFG( + bool success = cfg.parse(fmt::format( + R"CFG( ACTIVITIES_WARMUP_PERIOD_SECS = {} ACTIVITIES_DURATION_SECS = 1 ACTIVITIES_LOG_FILE = {} PROFILE_START_TIME = {} - )CFG", warmup, filename, duration_cast(startTime.time_since_epoch()).count())); + )CFG", + warmup, + filename, + duration_cast(startTime.time_since_epoch()).count())); EXPECT_TRUE(success); EXPECT_FALSE(profiler.isActive()); auto logger = std::make_unique(cfg.activitiesLogFile()); - // Usually configuration is done when now is startTime - warmup to kick off warmup - // but start right away in the test + // Usually configuration is done when now is startTime - warmup to kick off + // warmup but start right away in the test profiler.configure(cfg, now); profiler.setLogger(logger.get()); @@ -316,8 +324,9 @@ TEST(CuptiActivityProfiler, AsyncTrace) { auto next = now + milliseconds(1000); - // performRunLoopStep can also be called by an application thread to update iteration count - // since this config does not use iteration this should have no effect on the state + // performRunLoopStep can also be called by an application thread to update + // iteration count since this config does not use iteration this should have + // no effect on the state while (++iter < 20) { profiler.performRunLoopStep(now, now, iter); } @@ -340,8 +349,8 @@ TEST(CuptiActivityProfiler, AsyncTrace) { EXPECT_TRUE(profiler.isActive()); - profiler.performRunLoopStep(nextnext,nextnext); - profiler.performRunLoopStep(nextnext,nextnext); + profiler.performRunLoopStep(nextnext, nextnext); + profiler.performRunLoopStep(nextnext, nextnext); // Assert that tracing has completed EXPECT_FALSE(profiler.isActive()); @@ -354,12 +363,10 @@ TEST(CuptiActivityProfiler, AsyncTraceUsingIter) { {"CuptiActivityProfiler.cpp", "output_json.cpp"}); SET_LOG_VERBOSITY_LEVEL(1, log_modules); - auto runIterTest = [&]( - int start_iter, int warmup_iters, int trace_iters) { - - LOG(INFO ) << "Async Trace Test: start_iteration = " << start_iter - << " warmup iterations = " << warmup_iters - << " trace iterations = " << trace_iters; + auto runIterTest = [&](int start_iter, int warmup_iters, int trace_iters) { + LOG(INFO) << "Async Trace Test: start_iteration = " << start_iter + << " warmup iterations = " << warmup_iters + << " trace iterations = " << trace_iters; MockCuptiActivities activities; CuptiActivityProfiler profiler(activities, /*cpu only*/ true); @@ -372,21 +379,26 @@ TEST(CuptiActivityProfiler, AsyncTraceUsingIter) { int iter = 0; auto now = system_clock::now(); - bool success = cfg.parse(fmt::format(R"CFG( + bool success = cfg.parse(fmt::format( + R"CFG( PROFILE_START_ITERATION = {} ACTIVITIES_WARMUP_ITERATIONS={} ACTIVITIES_ITERATIONS={} ACTIVITIES_DURATION_SECS = 1 ACTIVITIES_LOG_FILE = {} - )CFG", start_iter, warmup_iters, trace_iters, filename)); + )CFG", + start_iter, + warmup_iters, + trace_iters, + filename)); EXPECT_TRUE(success); EXPECT_FALSE(profiler.isActive()); auto logger = std::make_unique(cfg.activitiesLogFile()); - // Usually configuration is done when now is startIter - warmup iter to kick off warmup - // but start right away in the test + // Usually configuration is done when now is startIter - warmup iter to kick + // off warmup but start right away in the test while (iter < (start_iter - warmup_iters)) { profiler.performRunLoopStep(now, now, iter++); } @@ -441,10 +453,8 @@ TEST(CuptiActivityProfiler, AsyncTraceUsingIter) { } TEST_F(CuptiActivityProfilerTest, SyncTrace) { - // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); + std::vector log_modules({"CuptiActivityProfiler.cpp"}); SET_LOG_VERBOSITY_LEVEL(2, log_modules); // Start and stop profiling @@ -487,7 +497,8 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { gpuOps->addSyncActivity( start_time_ns + 224, start_time_ns + 226, 7, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, 1 /*stream*/); - // This event should be ignored because it is not on a stream that has no GPU kernels + // This event should be ignored because it is not on a stream that has no GPU + // kernels gpuOps->addSyncActivity( start_time_ns + 226, start_time_ns + 230, 8, CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT, 4 /*stream*/); @@ -512,6 +523,7 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { for (auto& activity : *trace.activities()) { activityCounts[activity->name()]++; resourceIds[activity->resourceId()]++; + LOG(INFO) << "[test]" << activity->name() << "," << activity->resourceId(); } for (const auto& p : activityCounts) { LOG(INFO) << p.first << ": " << p.second; @@ -548,7 +560,7 @@ TEST_F(CuptiActivityProfilerTest, SyncTrace) { } EXPECT_TRUE(fd); // Should expect at least 100 bytes - struct stat buf{}; + struct stat buf {}; fstat(fd, &buf); EXPECT_GT(buf.st_size, 100); #endif @@ -669,11 +681,11 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { auto expectedInSplitStr = fmt::format("\"[{}, ...]\"", fmt::join(expectedInSplit, ", ")); EXPECT_EQ(cpu_annotation->getMetadataValue(kInSplit), expectedInSplitStr); - std::vector expectedGroupRanks(kTruncatLength-1, 0); - auto expectedGroupRanksStr = - fmt::format("\"[{}, ..., {}]\"", fmt::join(expectedGroupRanks, ", "), "0"); - EXPECT_EQ(cpu_annotation->getMetadataValue(kGroupRanks), expectedGroupRanksStr); - + std::vector expectedGroupRanks(kTruncatLength - 1, 0); + auto expectedGroupRanksStr = fmt::format( + "\"[{}, ..., {}]\"", fmt::join(expectedGroupRanks, ", "), "0"); + EXPECT_EQ( + cpu_annotation->getMetadataValue(kGroupRanks), expectedGroupRanksStr); #ifdef __linux__ // Test saved output can be loaded as JSON @@ -727,8 +739,7 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) { TEST_F(CuptiActivityProfilerTest, GpuUserAnnotationTest) { // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); + std::vector log_modules({"CuptiActivityProfiler.cpp"}); SET_LOG_VERBOSITY_LEVEL(2, log_modules); // Start and stop profiling @@ -788,14 +799,12 @@ TEST_F(CuptiActivityProfilerTest, GpuUserAnnotationTest) { EXPECT_EQ(gpu_annotation->deviceId(), kernel1->deviceId()); EXPECT_EQ(gpu_annotation->resourceId(), kernel1->resourceId()); EXPECT_EQ(gpu_annotation->correlationId(), annotation->correlationId()); - EXPECT_EQ(gpu_annotation->name(), annotation->name()); + EXPECT_EQ(gpu_annotation->name(), annotation->name()); } TEST_F(CuptiActivityProfilerTest, SubActivityProfilers) { - // Verbose logging is useful for debugging - std::vector log_modules( - {"CuptiActivityProfiler.cpp"}); + std::vector log_modules({"CuptiActivityProfiler.cpp"}); SET_LOG_VERBOSITY_LEVEL(2, log_modules); // Setup example events to test @@ -819,12 +828,11 @@ TEST_F(CuptiActivityProfilerTest, SubActivityProfilers) { test_activities[2].activityName = "Operator bar"; auto mock_activity_profiler = - std::make_unique(test_activities); + std::make_unique(test_activities); MockCuptiActivities activities; CuptiActivityProfiler profiler(activities, /*cpu only*/ true); - profiler.addChildActivityProfiler( - std::move(mock_activity_profiler)); + profiler.addChildActivityProfiler(std::move(mock_activity_profiler)); profiler.configure(*cfg_, start_time); profiler.startTrace(start_time); @@ -860,7 +868,7 @@ TEST_F(CuptiActivityProfilerTest, SubActivityProfilers) { EXPECT_TRUE(fd); // Should expect at least 100 bytes - struct stat buf{}; + struct stat buf {}; fstat(fd, &buf); EXPECT_GT(buf.st_size, 100); } @@ -873,12 +881,12 @@ TEST_F(CuptiActivityProfilerTest, BufferSizeLimitTestWarmup) { int maxBufferSizeMB = 3; - auto startTimeEpoch = std::to_string(duration_cast(startTime.time_since_epoch()).count()); + auto startTimeEpoch = std::to_string( + duration_cast(startTime.time_since_epoch()).count()); std::string maxBufferSizeMBStr = std::to_string(maxBufferSizeMB); cfg_->handleOption("ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB", maxBufferSizeMBStr); cfg_->handleOption("PROFILE_START_TIME", startTimeEpoch); - EXPECT_FALSE(profiler.isActive()); profiler.configure(*cfg_, now); EXPECT_TRUE(profiler.isActive()); @@ -887,7 +895,8 @@ TEST_F(CuptiActivityProfilerTest, BufferSizeLimitTestWarmup) { uint8_t* buf; size_t gpuBufferSize; size_t maxNumRecords; - cuptiActivities_.bufferRequestedOverride(&buf, &gpuBufferSize, &maxNumRecords); + cuptiActivities_.bufferRequestedOverride( + &buf, &gpuBufferSize, &maxNumRecords); } // fast forward to startTime and profiler is now running @@ -921,20 +930,23 @@ TEST(CuptiActivityProfiler, MetadataJsonFormatingTest) { auto now = system_clock::now(); auto startTime = now + seconds(2); - bool success = cfg.parse(fmt::format(R"CFG( + bool success = cfg.parse(fmt::format( + R"CFG( ACTIVITIES_WARMUP_PERIOD_SECS = 1 ACTIVITIES_DURATION_SECS = 1 ACTIVITIES_LOG_FILE = {} PROFILE_START_TIME = {} - )CFG", filename, duration_cast(startTime.time_since_epoch()).count())); + )CFG", + filename, + duration_cast(startTime.time_since_epoch()).count())); EXPECT_TRUE(success); EXPECT_FALSE(profiler.isActive()); auto logger = std::make_unique(cfg.activitiesLogFile()); - // Usually configuration is done when now is startTime - warmup to kick off warmup - // but start right away in the test + // Usually configuration is done when now is startTime - warmup to kick off + // warmup but start right away in the test profiler.configure(cfg, now); profiler.setLogger(logger.get()); diff --git a/libkineto/test/CuptiCallbackApiTest.cpp b/libkineto/test/CuptiCallbackApiTest.cpp index 66dfc4143..b292688e2 100644 --- a/libkineto/test/CuptiCallbackApiTest.cpp +++ b/libkineto/test/CuptiCallbackApiTest.cpp @@ -6,8 +6,8 @@ * LICENSE file in the root directory of this source tree. */ -#include "src/Logger.h" #include "src/CuptiCallbackApi.h" +#include "src/Logger.h" #include #include @@ -21,16 +21,31 @@ const size_t some_data = 42; std::atomic simple_cb_calls = 0; -void simple_cb( +void simple_cudaLaunchKernel_cb( CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* cbInfo) { - // simple arg check EXPECT_EQ(domain, CUPTI_CB_DOMAIN_RUNTIME_API); EXPECT_EQ(cbid, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000); EXPECT_EQ(*reinterpret_cast(cbInfo), some_data); + LOG(INFO) << "CUDA Launch Kernel called"; + + simple_cb_calls++; +} + +void simple_cudaLaunchKernelExC_cb( + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const CUpti_CallbackData* cbInfo) { + // simple arg check + EXPECT_EQ(domain, CUPTI_CB_DOMAIN_RUNTIME_API); + EXPECT_EQ(cbid, CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060); + EXPECT_EQ(*reinterpret_cast(cbInfo), some_data); + + LOG(INFO) << "CUDA Launch Kernel ExC called"; + simple_cb_calls++; } @@ -48,24 +63,35 @@ void atomic_cb( void empty_cb( CUpti_CallbackDomain /*domain*/, CUpti_CallbackId /*cbid*/, - const CUpti_CallbackData* /*cbInfo*/) { -} + const CUpti_CallbackData* /*cbInfo*/) {} TEST(CuptiCallbackApiTest, SimpleTest) { auto api = CuptiCallbackApi::singleton(); - auto addSimpleCallback = [&]() -> bool { - bool ret = api->registerCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &simple_cb - ); + auto addSimpleCallback = [&](CuptiCallbackApi::CuptiCallBackID cbApi, + CuptiCallbackFn cb) -> bool { + bool ret = api->registerCallback(CUPTI_CB_DOMAIN_RUNTIME_API, cbApi, cb); return ret; }; - EXPECT_TRUE(addSimpleCallback()) << "Failed to add callback"; + + EXPECT_TRUE(addSimpleCallback( + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL, + &simple_cudaLaunchKernel_cb)) + << "Failed to add callback"; + EXPECT_TRUE(addSimpleCallback( + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, + &simple_cudaLaunchKernelExC_cb)) + << "Failed to add callback"; // duplicate add should be okay - EXPECT_TRUE(addSimpleCallback()) << "Failed to re-add callback"; + EXPECT_TRUE(addSimpleCallback( + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL, + &simple_cudaLaunchKernel_cb)) + << "Failed to re-add callback"; + EXPECT_TRUE(addSimpleCallback( + CuptiCallbackApi::CuptiCallBackID::CUDA_LAUNCH_KERNEL_EXC, + &simple_cudaLaunchKernelExC_cb)) + << "Failed to re-add callback"; simple_cb_calls = 0; @@ -77,19 +103,38 @@ TEST(CuptiCallbackApiTest, SimpleTest) { EXPECT_EQ(simple_cb_calls, 1); + api->__callback_switchboard( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, + reinterpret_cast(&some_data)); + + EXPECT_EQ(simple_cb_calls, 2); + bool ret = api->deleteCallback( CUPTI_CB_DOMAIN_RUNTIME_API, CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &simple_cb - ); + &simple_cudaLaunchKernel_cb); + + EXPECT_TRUE(ret) << "Failed to remove callback"; + + ret = api->deleteCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL_EXC, + &simple_cudaLaunchKernelExC_cb); EXPECT_TRUE(ret) << "Failed to remove callback"; ret = api->deleteCallback( CUPTI_CB_DOMAIN_RUNTIME_API, CuptiCallbackApi::CUDA_LAUNCH_KERNEL, - &atomic_cb - ); + &atomic_cb); + + EXPECT_FALSE(ret) << "oops! deleted a callback that was never added"; + + ret = api->deleteCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL_EXC, + &atomic_cb); EXPECT_FALSE(ret) << "oops! deleted a callback that was never added"; } @@ -97,11 +142,10 @@ TEST(CuptiCallbackApiTest, SimpleTest) { TEST(CuptiCallbackApiTest, AllCallbacks) { auto api = CuptiCallbackApi::singleton(); - auto testCallback = [&]( - CUpti_CallbackDomain domain, - CUpti_CallbackId cbid, - CuptiCallbackApi::CuptiCallBackID kineto_cbid) -> bool { - + auto testCallback = + [&](CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + CuptiCallbackApi::CuptiCallBackID kineto_cbid) -> bool { bool ret = api->registerCallback(domain, kineto_cbid, atomic_cb); EXPECT_TRUE(ret) << "Failed to add callback"; @@ -119,27 +163,29 @@ TEST(CuptiCallbackApiTest, AllCallbacks) { return ret; }; - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RESOURCE, - CUPTI_CBID_RESOURCE_CONTEXT_CREATED, - CuptiCallbackApi::RESOURCE_CONTEXT_CREATED)) - << "Failed to run callback for RESOURCE_CONTEXT_CREATED"; - - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RESOURCE, - CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, - CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED)) - << "Failed to run callback for RESOURCE_CONTEXT_DESTROYED"; - - EXPECT_TRUE( - testCallback( - CUPTI_CB_DOMAIN_RUNTIME_API, - CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, - CuptiCallbackApi::CUDA_LAUNCH_KERNEL)) - << "Failed to run callback for CUDA_LAUNCH_KERNEL"; + EXPECT_TRUE(testCallback( + CUPTI_CB_DOMAIN_RESOURCE, + CUPTI_CBID_RESOURCE_CONTEXT_CREATED, + CuptiCallbackApi::RESOURCE_CONTEXT_CREATED)) + << "Failed to run callback for RESOURCE_CONTEXT_CREATED"; + + EXPECT_TRUE(testCallback( + CUPTI_CB_DOMAIN_RESOURCE, + CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, + CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED)) + << "Failed to run callback for RESOURCE_CONTEXT_DESTROYED"; + + EXPECT_TRUE(testCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL)) + << "Failed to run callback for CUDA_LAUNCH_KERNEL"; + EXPECT_TRUE(testCallback( + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, + CuptiCallbackApi::CUDA_LAUNCH_KERNEL_EXC)) + << "Failed to run callback for CUDA_LAUNCH_KERNEL_EXC"; } TEST(CuptiCallbackApiTest, ContentionTest) { @@ -147,7 +193,7 @@ TEST(CuptiCallbackApiTest, ContentionTest) { const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API; const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000; const CuptiCallbackApi::CuptiCallBackID kineto_cbid = - CuptiCallbackApi::CUDA_LAUNCH_KERNEL; + CuptiCallbackApi::CUDA_LAUNCH_KERNEL; bool ret = api->registerCallback(domain, kineto_cbid, empty_cb); EXPECT_TRUE(ret) << "Failed to add callback"; @@ -160,19 +206,18 @@ TEST(CuptiCallbackApiTest, ContentionTest) { // simulate callbacks being executed on multiple threads in parallel // during this interval add a new atomic_callback. // this test ensured mutual exclusion is working fine - auto read_fn = [&](int tid){ + auto read_fn = [&](int tid) { auto start_ts = high_resolution_clock::now(); for (int i = 0; i < iters; i++) { api->__callback_switchboard(domain, cbid, nullptr); } - auto runtime_ms = duration_cast( - high_resolution_clock::now() - start_ts); + auto runtime_ms = + duration_cast(high_resolution_clock::now() - start_ts); LOG(INFO) << "th " << tid << " done in " << runtime_ms.count() << " ms"; }; - std::vector read_ths; - for (int i = 0; i< num_readers; i++) { + for (int i = 0; i < num_readers; i++) { read_ths.emplace_back(read_fn, i); } @@ -183,7 +228,7 @@ TEST(CuptiCallbackApiTest, ContentionTest) { t.join(); } - //EXPECT_GT(simple_cb_calls, 0) + // EXPECT_GT(simple_cb_calls, 0) // << "Atomic callback should have been called at least once."; api->deleteCallback(domain, kineto_cbid, empty_cb); @@ -191,14 +236,13 @@ TEST(CuptiCallbackApiTest, ContentionTest) { } TEST(CuptiCallbackApiTest, Bechmark) { - constexpr int iters = 1000; // atomic bench a number of times to get a baseline const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RUNTIME_API; const CUpti_CallbackId cbid = CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000; const CuptiCallbackApi::CuptiCallBackID kineto_cbid = - CuptiCallbackApi::CUDA_LAUNCH_KERNEL; + CuptiCallbackApi::CUDA_LAUNCH_KERNEL; LOG(INFO) << "Iteration count = " << iters; @@ -214,11 +258,10 @@ TEST(CuptiCallbackApiTest, Bechmark) { for (int i = 0; i < iters; i++) { (*cbfn)(domain, cbid, nullptr); } - auto delta_baseline_ns = duration_cast( - high_resolution_clock::now() - start_ts); + auto delta_baseline_ns = + duration_cast(high_resolution_clock::now() - start_ts); LOG(INFO) << "Baseline runtime = " << delta_baseline_ns.count() << " ns"; - auto api = CuptiCallbackApi::singleton(); bool ret = api->registerCallback(domain, kineto_cbid, cbfn); EXPECT_TRUE(ret) << "Failed to add callback"; @@ -233,12 +276,12 @@ TEST(CuptiCallbackApiTest, Bechmark) { api->__callback_switchboard(domain, cbid, nullptr); } - auto delta_callback_ns = duration_cast( - high_resolution_clock::now() - start_ts); + auto delta_callback_ns = + duration_cast(high_resolution_clock::now() - start_ts); LOG(INFO) << "Callback runtime = " << delta_callback_ns.count() << " ns"; - LOG(INFO) << "Callback runtime per iteration = " << - (delta_callback_ns.count() - delta_baseline_ns.count()) / (double) iters - << " ns"; - + LOG(INFO) << "Callback runtime per iteration = " + << (delta_callback_ns.count() - delta_baseline_ns.count()) / + (double)iters + << " ns"; }