From c7cb73cdc27e4d4205bc1ce68c6c2fdadf79ceb9 Mon Sep 17 00:00:00 2001 From: Moz Date: Thu, 2 Nov 2023 22:28:52 +0100 Subject: [PATCH] add marker --- cuda/CudaMatch.cu | 6 ++-- cuda/CudaParkSerializer.cu | 6 ++-- cuda/CudaPlotContext.h | 6 ++-- cuda/CudaPlotPhase2.cu | 15 ++++++---- cuda/CudaPlotPhase3.cu | 24 ++++++++++------ cuda/CudaPlotPhase3Step2.cu | 27 ++++++++++++------ cuda/CudaPlotPhase3Step3.cu | 51 ++++++++++++++++++++++----------- cuda/CudaPlotUtil.cu | 6 ++-- cuda/CudaPlotter.cu | 24 ++++++++++------ cuda/GpuDownloadStream.cu | 18 ++++++++---- cuda/GpuQueue.cu | 44 ++++++++++++++++++---------- cuda/GpuStreams.cu | 57 ++++++++++++++++++++++++------------- cuda/chacha8.cu | 15 ++++++---- 13 files changed, 199 insertions(+), 100 deletions(-) diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu index 359df68d..c1fdbff7 100644 --- a/cuda/CudaMatch.cu +++ b/cuda/CudaMatch.cu @@ -592,8 +592,10 @@ void CudaMatchBucketizedK32( CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount ); } - CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) ); - CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 1) +CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 2) +CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) ); ScanGroupsCudaK32Bucket<<>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask ); byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT ); diff --git a/cuda/CudaParkSerializer.cu b/cuda/CudaParkSerializer.cu index f3e8b8d4..91516a7a 100644 --- a/cuda/CudaParkSerializer.cu +++ b/cuda/CudaParkSerializer.cu @@ -20,8 +20,10 @@ void InitFSEBitMask( CudaK32PlotContext& cx ) 0x3FFFFFFF, 0x7FFFFFFF }; - CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 3) +CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 4) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); } diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h index fc5884b3..c83b3a32 100644 --- a/cuda/CudaPlotContext.h +++ b/cuda/CudaPlotContext.h @@ -576,8 +576,10 @@ inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t s byte hash[32]; void* hostBuffer = bbvirtallocboundednuma( size ); - CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) ); - CudaErrCheck( cudaStreamSynchronize( stream ) ); + Log::Line( "Marker Set to %d", 5) +CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) ); + Log::Line( "Marker Set to %d", 6) +CudaErrCheck( cudaStreamSynchronize( stream ) ); blake3_hasher hasher; blake3_hasher_init( &hasher ); diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu index 87c1b16c..d66ab8b6 100644 --- a/cuda/CudaPlotPhase2.cu +++ b/cuda/CudaPlotPhase2.cu @@ -119,7 +119,8 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount - CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 7) +CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) ); #else #define G_PRUNED_COUNTS #endif @@ -176,7 +177,8 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) } // Zero-out marks - CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 8) +CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) ); // Load first bucket's worth of pairs LoadPairs( cx, p2, rTable, 0 ); @@ -230,9 +232,12 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES { uint32 prunedEntryCount = 0; - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); - CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 9) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 10) +CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 11) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable]; Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1, diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu index 8fcdfe2a..176f344c 100644 --- a/cuda/CudaPlotPhase3.cu +++ b/cuda/CudaPlotPhase3.cu @@ -374,12 +374,14 @@ void Step1( CudaK32PlotContext& cx ) const TableId rTable = cx.table; // Clear pruned table count - CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 12) +CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); // Load marking table (must be loaded before first bucket, on the same stream) if( cx.table < TableId::Table7 ) { - CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable], + Log::Line( "Marker Set to %d", 13) +CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable], GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) ); } @@ -416,7 +418,8 @@ void Step1( CudaK32PlotContext& cx ) // Generate map #define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks - CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 14) +CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) ); if( cx.table < TableId::Table7 ) PruneAndWriteRMap<<>>( KERN_RMAP_ARGS ); @@ -436,7 +439,8 @@ void Step1( CudaK32PlotContext& cx ) // Download slice counts cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream(); - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 15) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); // Wait for completion @@ -446,7 +450,8 @@ void Step1( CudaK32PlotContext& cx ) s1.pairsLIn.Reset(); s1.pairsRIn.Reset(); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 16) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); // Add-up pruned bucket counts and tables counts memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); @@ -521,7 +526,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) #endif // Load R Marking table (must be loaded before first bucket, on the same stream) - CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable], + Log::Line( "Marker Set to %d", 17) +CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable], GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) ); // Load initial bucket @@ -536,7 +542,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) uint64 tablePrunedEntryCount = 0; uint32 rTableOffset = 0; - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 18) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -559,7 +566,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT; #if _DEBUG - CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 19) +CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) ); #endif CudaConvertInlinedXsToLinePoints<<>>( diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu index 3a7a6449..eaa03f15 100644 --- a/cuda/CudaPlotPhase3Step2.cu +++ b/cuda/CudaPlotPhase3Step2.cu @@ -148,7 +148,8 @@ static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryC const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT; // #TODO: Use upload stream? - CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 20) +CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); } CudaConvertRMapToLinePoints<<>>( Rmap2LPParams, 0 ); @@ -223,7 +224,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) if( isCompressed ) { // Copy from upload buffer to working buffer - CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 21) +CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); } else { @@ -264,7 +266,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) LoadRBucket( cx, 0 ); // Clear pruned entry count - CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 22) +CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); // Unpack the first map beforehand UnpackLBucket( cx, 0 ); @@ -275,7 +278,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) /// uint32 rTableOffset = 0; // Track the global origin index of R entry/line point - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 23) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -298,7 +302,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount ); uint32* nextLTable = s2.devLTable[nextBucket & 1]; - CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 24) +CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); } if( nextBucketL < BBCU_BUCKET_COUNT ) @@ -351,12 +356,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) // Copy slice counts & bucket count cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream(); - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 25) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) ); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 26) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -481,9 +488,11 @@ void WritePark7( CudaK32PlotContext& cx ) const size_t copySize = sizeof( uint32 ) * retainedEntryCount; if( !isLastBucket ) - CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 27) +CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) ); else - CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 28) +CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) ); } // Download parks & write to plot diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu index c8f9337b..93b7e681 100644 --- a/cuda/CudaPlotPhase3Step3.cu +++ b/cuda/CudaPlotPhase3Step3.cu @@ -61,7 +61,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable; // (upload must be loaded before first bucket, on the same stream) - CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, + Log::Line( "Marker Set to %d", 29) +CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, s3.lpIn.GetQueue()->GetStream() ) ); // Load initial bucket @@ -116,11 +117,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) cudaStream_t lpStream = cx.computeStream;//B; cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream(); - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) ); - CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 30) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 31) +CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) ); // Set initial event LP stream event as set. - CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); + Log::Line( "Marker Set to %d", 32) +CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); cx.parkFence->Reset( 0 ); s3.parkBucket = 0; @@ -145,23 +149,27 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) #if _DEBUG { size_t sortRequiredSize = 0; - CudaErrCheck( cub::DeviceRadixSort::SortPairs( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) ); + Log::Line( "Marker Set to %d", 33) +CudaErrCheck( cub::DeviceRadixSort::SortPairs( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) ); ASSERT( s3.sizeTmpSort >= sortRequiredSize ); } #endif // Wait for the previous bucket's LP work to finish, so we can re-use the device buffer - CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) ); + Log::Line( "Marker Set to %d", 34) +CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) ); // #TODO: We can use 63-7 (log2(128 buckets)), which might be faster // #NOTE: I did change it and the sort failed. Investigate. - CudaErrCheck( cub::DeviceRadixSort::SortPairs( + Log::Line( "Marker Set to %d", 35) +CudaErrCheck( cub::DeviceRadixSort::SortPairs( s3.devSortTmpData, s3.sizeTmpSort, unsortedLinePoints, sortedLinePoints, unsortedIndices, sortedIndices, bucketEntryCount, 0, 64, sortAndMapStream ) ); - CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 36) +CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) ); s3.lpIn .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr; s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices = nullptr; @@ -190,12 +198,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET ); // Wait for sort to finish - CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) ); + Log::Line( "Marker Set to %d", 37) +CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) ); // Deltafy line points DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream ); - CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park + Log::Line( "Marker Set to %d", 38) +CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park // Compress line point parks byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream ); @@ -214,20 +224,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) if( !isLastBucket ) { // Not the last bucket, so retain entries for the next GPU compression bucket - CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) ); + Log::Line( "Marker Set to %d", 39) +CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) ); } else { // No more buckets so we have to compress this last park on the CPU - CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) ); + Log::Line( "Marker Set to %d", 40) +CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) ); hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints : (uint64*)( hostParksWriter + hostParkSize * parkCount ); - CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) ); + Log::Line( "Marker Set to %d", 41) +CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) ); } } - CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again + Log::Line( "Marker Set to %d", 42) +CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again // Download parks @@ -270,17 +284,20 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) } // Copy park overrun count - CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) ); + Log::Line( "Marker Set to %d", 43) +CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) ); // Wait for parks to complete downloading s3.parksOut.WaitForCompletion(); s3.parksOut.Reset(); // Copy map slice counts (for the next step 2) - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 44) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 45) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." ); diff --git a/cuda/CudaPlotUtil.cu b/cuda/CudaPlotUtil.cu index 4f7f18b3..228be9f6 100644 --- a/cuda/CudaPlotUtil.cu +++ b/cuda/CudaPlotUtil.cu @@ -21,7 +21,8 @@ void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_ GenSortKey<<>>( entryCount, devKey ); if( synchronize ) - CudaErrCheck( cudaStreamSynchronize( stream ) ); + Log::Line( "Marker Set to %d", 46) +CudaErrCheck( cudaStreamSynchronize( stream ) ); } @@ -48,7 +49,8 @@ void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const SortByKey<<>>( entryCount, devKey, devInput, devOutput ); if( synchronize ) - CudaErrCheck( cudaStreamSynchronize( stream ) ); + Log::Line( "Marker Set to %d", 47) +CudaErrCheck( cudaStreamSynchronize( stream ) ); } //----------------------------------------------------------- diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu index 80ba8b0e..64e9e94d 100644 --- a/cuda/CudaPlotter.cu +++ b/cuda/CudaPlotter.cu @@ -201,10 +201,14 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext ) CudaInit( cx ); - CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 48) +CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 49) +CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 50) +CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 51) +CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) ); cudaEventCreateWithFlags( &cx.computeEventA, cudaEventDisableTiming ); cudaEventCreateWithFlags( &cx.computeEventB, cudaEventDisableTiming ); @@ -318,7 +322,8 @@ void CudaInit( CudaK32PlotContext& cx ) cx.cudaDevice = (int32)cx.cfg.deviceIndex; cudaDeviceProp* cudaDevProps = new cudaDeviceProp{}; - CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) ); + Log::Line( "Marker Set to %d", 52) +CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) ); cx.cudaDevProps = cudaDevProps; Log::Line( "Selected cuda device %u : %s", cx.cudaDevice, cudaDevProps->name ); @@ -364,7 +369,8 @@ void CudaK32Plotter::Run( const PlotRequest& req ) const auto& cfg = _cfg; // Only start profiling from here (don't profile allocations) - CudaErrCheck( cudaProfilerStart() ); + Log::Line( "Marker Set to %d", 53) +CudaErrCheck( cudaProfilerStart() ); ASSERT( cx.plotWriter == nullptr ); cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO ); @@ -522,7 +528,8 @@ void FpTable( CudaK32PlotContext& cx ) } // Clear slice counts - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 54) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); // Load initial buckets UploadBucketForTable( cx, 0 ); @@ -533,7 +540,8 @@ void FpTable( CudaK32PlotContext& cx ) FpTableBucket( cx, bucket ); } - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 55) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); // Copy bucket slices to host cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu index 4d35a06b..d903adc2 100644 --- a/cuda/GpuDownloadStream.cu +++ b/cuda/GpuDownloadStream.cu @@ -11,7 +11,8 @@ void* GpuDownloadBuffer::GetDeviceBuffer() { const uint32 index = self->outgoingSequence % self->bufferCount; - CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); + Log::Line( "Marker Set to %d", 56) +CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); return self->deviceBuffer[index]; } @@ -25,7 +26,8 @@ void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream ) self->lockSequence++; // Wait for the device buffer to be free to be used by kernels - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); + Log::Line( "Marker Set to %d", 57) +CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); return self->deviceBuffer[index]; } @@ -73,10 +75,12 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co // // Copy // // Signal that the device buffer is free to be re-used - // CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); + // Log::Line( "Marker Set to %d", 58) +CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); // // Launch copy command - // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ + // Log::Line( "Marker Set to %d", 59) +CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ // const CopyInfo& c = *reinterpret_cast( userData ); // IGpuBuffer* self = c.self; @@ -131,11 +135,13 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_ // Signal from the work stream when it has finished doing kernel work with the device buffer - CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) ); + Log::Line( "Marker Set to %d", 60) +CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) ); // From the download stream, wait for the work stream to finish cudaStream_t downloadStream = self->queue->_stream; - CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) ); + Log::Line( "Marker Set to %d", 61) +CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) ); if( self->diskBuffer ) diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu index f673f640..110929a0 100644 --- a/cuda/GpuQueue.cu +++ b/cuda/GpuQueue.cu @@ -9,9 +9,12 @@ GpuQueue::GpuQueue( Kind kind ) : _kind( kind ) , _bufferReadySignal( BBCU_BUCKET_COUNT ) { - CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 62) +CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 63) +CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 64) +CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) ); _queueThread.Run( QueueThreadEntryPoint, this ); } @@ -168,13 +171,19 @@ struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, b for( int32 i = 0; i < desc.bufferCount; i++ ) { - CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) ); - - CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 65) +CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 66) +CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 67) +CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 68) +CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) ); + + Log::Line( "Marker Set to %d", 69) +CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 70) +CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) ); buf->deviceBuffer[i] = devBuffers[i]; buf->pinnedBuffer[i] = pinnedBuffers[i]; @@ -240,14 +249,17 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams. // Signal from the work stream into the callback stream that we are ready for callback - CudaErrCheck( cudaEventRecord( lockEvent, stream ) ); + Log::Line( "Marker Set to %d", 71) +CudaErrCheck( cudaEventRecord( lockEvent, stream ) ); // Wait on the callback stream until it's ready to dsitpatch - CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) ); + Log::Line( "Marker Set to %d", 72) +CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) ); // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance. auto* fnCpy = new std::function( std::move( func ) ); - CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){ + Log::Line( "Marker Set to %d", 73) +CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){ auto& fn = *reinterpret_cast*>( userData ); fn(); @@ -256,10 +268,12 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c }, fnCpy ) ); // Signal from the callback stream that the callback finished - CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) ); + Log::Line( "Marker Set to %d", 74) +CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) ); // Wait on work stream for the callback to complete - CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) ); + Log::Line( "Marker Set to %d", 75) +CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) ); } size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc ) diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu index 2c129408..e075e306 100644 --- a/cuda/GpuStreams.cu +++ b/cuda/GpuStreams.cu @@ -60,26 +60,32 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t { // Copy from unpinned to pinned first // #TODO: This should be done in a different backgrund host-to-host copy stream - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) ); - CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) ); + Log::Line( "Marker Set to %d", 76) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) ); + Log::Line( "Marker Set to %d", 77) +CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) ); hostBuffer = self->pinnedBuffer[index]; } // Ensure the device buffer is ready for use - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); + Log::Line( "Marker Set to %d", 78) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); // Upload to the device buffer - CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) ); + Log::Line( "Marker Set to %d", 79) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) ); if( !isDirect ) { // Signal that the pinned buffer is ready for re-use - CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 80) +CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); } // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 81) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); } void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize ) @@ -164,7 +170,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 const auto copyMode = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost; // Wait on device or pinned buffer to be ready (depending if a direct copy or not) - CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) ); + Log::Line( "Marker Set to %d", 82) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) ); const byte* src = (byte*)hostBuffer; byte* dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] ); @@ -174,7 +181,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 { const size_t size = *sizes * (size_t)elementSize; - CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) ); + Log::Line( "Marker Set to %d", 83) +CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) ); dst += size; src += srcStride; @@ -199,15 +207,19 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 } // #TODO: This should be done in a copy stream to perform the copies in the background - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); - CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) ); + Log::Line( "Marker Set to %d", 84) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); + Log::Line( "Marker Set to %d", 84) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) ); if( !self->diskBuffer ) - CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 85) +CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); } // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 86) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); } void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, @@ -218,7 +230,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB auto stream = self->queue->GetStream(); // Ensure the device buffer is ready for use - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); + Log::Line( "Marker Set to %d", 87) +CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); // Perform uploads //size_t deviceCopySize = 0; @@ -229,7 +242,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB { const size_t size = *counts * (size_t)elementSize; //memcpy( dst, src, size ); - CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) ); + Log::Line( "Marker Set to %d", 88) +CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) ); //deviceCopySize += size; @@ -239,10 +253,12 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB } // Copy to device buffer - //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) ); + //Log::Line( "Marker Set to %d", 89) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) ); // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) ); + Log::Line( "Marker Set to %d", 90) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) ); } void GpuUploadBuffer::Upload( const void* hostBuffer, const size_t size ) @@ -269,7 +285,8 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream ) const uint32 index = self->completedSequence % self->bufferCount; self->completedSequence++; - CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) ); + Log::Line( "Marker Set to %d", 91) +CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) ); return self->deviceBuffer[index]; } @@ -283,7 +300,8 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream ) const uint32 index = self->lockSequence % self->bufferCount; self->lockSequence++; - CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) ); + Log::Line( "Marker Set to %d", 92) +CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) ); } void GpuUploadBuffer::WaitForPreloadsToComplete() @@ -341,7 +359,8 @@ DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func ) { auto* fnCpy = new std::function( std::move( func ) ); - CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { + Log::Line( "Marker Set to %d", 93) +CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { auto& fn = *reinterpret_cast*>( userData ); fn(); diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu index 3e17c15a..32f665f2 100644 --- a/cuda/chacha8.cu +++ b/cuda/chacha8.cu @@ -201,8 +201,10 @@ void GenF1Cuda( CudaK32PlotContext& cx ) chacha8_ctx chacha; chacha8_keysetup( &chacha, key, 256, nullptr ); - CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) ); - CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 94) +CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 95) +CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); const uint32 outIndex = CudaK32PlotGetOutputIndex( cx ); @@ -218,7 +220,8 @@ void GenF1Cuda( CudaK32PlotContext& cx ) uint32* devMeta = (uint32*)cx.metaOut.LockDeviceBuffer( cx.computeStream ); #if _DEBUG - CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 96) +CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) ); #endif // Gen chacha blocks @@ -229,10 +232,12 @@ void GenF1Cuda( CudaK32PlotContext& cx ) } // Copy bucket slices to host - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 97) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 98) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); memcpy( &cx.bucketSlices[0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );