diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu index e827547f..c1fdbff7 100644 --- a/cuda/CudaMatch.cu +++ b/cuda/CudaMatch.cu @@ -564,17 +564,6 @@ cudaError CudaHarvestMatchK32( HarvestMatchK32Kernel<<>>( devOutPairs, devMatchCount, devYEntries, entryCount, matchOffset ); -// #if _DEBUG -// uint32 matchCount = 0; -// CudaErrCheck( cudaMemcpyAsync( &matchCount, devMatchCount, sizeof( uint32 ) , cudaMemcpyDeviceToHost, stream ) ); -// CudaErrCheck( cudaStreamSynchronize( stream ) ); -// CudaErrCheck( cudaStreamSynchronize( stream ) ); - -// Pair* matches = new Pair[matchCount]; -// CudaErrCheck( cudaMemcpyAsync( matches, devOutPairs, sizeof( Pair ) * matchCount , cudaMemcpyDeviceToHost, stream ) ); -// CudaErrCheck( cudaStreamSynchronize( stream ) ); -// CudaErrCheck( cudaStreamSynchronize( stream ) ); -// #endif return cudaSuccess; } @@ -603,8 +592,10 @@ void CudaMatchBucketizedK32( CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount ); } - CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) ); - CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 1) +CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 2) +CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) ); ScanGroupsCudaK32Bucket<<>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask ); byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT ); @@ -621,49 +612,3 @@ void CudaMatchBucketizedK32( MatchCudaK32Bucket<<>>( bucketMask, entryCount, cx.devGroupCount, devY, cx.devGroupBoundaries, cx.devMatchCount, cx.devMatches ); } -//----------------------------------------------------------- -// cudaError CudaHarvestMatchK32WithGroupScan( -// Pair* devOutPairs, -// uint32* devMatchCount, -// const uint32 maxMatches, -// uint32* devGroupIndices, -// uint32* devGroupIndicesTemp, -// const uint32 maxGroups, -// void* sortBuffer, -// const size_t sortBufferSize, -// const uint64* devYEntries, -// const uint32 entryCount, -// const uint32 matchOffset, -// cudaStream_t stream ) -// { -// // Scan for BC groups -// { -// const uint32 kblocks = 0; -// const uint32 kthreads = 0; - - -// // constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS ); -// // Initialize the entries to the max value so that they are not included in the sort -// CudaInitGroups<<>>( devGroupIndicesTemp, entryCount ); -// // CudaInitGroupsBucket<<>>( tmpGroupCounts ); - -// // Add first group and last ghost group -// CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount ); -// } - -// CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) ); -// CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) ); -// ScanGroupsCudaK32Bucket<<>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask ); - -// byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT ); -// size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 ); - -// #if _DEBUG -// size_t sortSize = 0; -// cub::DeviceRadixSort::SortKeys( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 ); -// ASSERT( sortSize <= sortTmpSize ); -// #endif - -// cub::DeviceRadixSort::SortKeys( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream ); - -// } diff --git 
a/cuda/CudaParkSerializer.cu b/cuda/CudaParkSerializer.cu index f3e8b8d4..91516a7a 100644 --- a/cuda/CudaParkSerializer.cu +++ b/cuda/CudaParkSerializer.cu @@ -20,8 +20,10 @@ void InitFSEBitMask( CudaK32PlotContext& cx ) 0x3FFFFFFF, 0x7FFFFFFF }; - CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 3) +CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 4) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); } diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h index fc5884b3..c83b3a32 100644 --- a/cuda/CudaPlotContext.h +++ b/cuda/CudaPlotContext.h @@ -576,8 +576,10 @@ inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t s byte hash[32]; void* hostBuffer = bbvirtallocboundednuma( size ); - CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) ); - CudaErrCheck( cudaStreamSynchronize( stream ) ); + Log::Line( "Marker Set to %d", 5) +CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) ); + Log::Line( "Marker Set to %d", 6) +CudaErrCheck( cudaStreamSynchronize( stream ) ); blake3_hasher hasher; blake3_hasher_init( &hasher ); diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu index 8d2d5094..d66ab8b6 100644 --- a/cuda/CudaPlotPhase2.cu +++ b/cuda/CudaPlotPhase2.cu @@ -119,7 +119,8 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield, #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount - CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) ); + Log::Line( "Marker Set to %d", 7) +CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) ); #else #define G_PRUNED_COUNTS #endif @@ -176,7 +177,8 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) } // Zero-out marks - CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 8) +CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) ); // Load first bucket's worth of pairs LoadPairs( cx, p2, rTable, 0 ); @@ -230,9 +232,12 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES { uint32 prunedEntryCount = 0; - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); - CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 9) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 10) +CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 11) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable]; Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1, @@ -245,22 +250,7 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) #if _DEBUG p2.outMarks.WaitForCompletion(); - // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); - // CudaErrCheck( 
cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) ); - // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[1]->GetStream() ) ); - // CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[2]->GetStream() ) ); - // byte* hByteField = bbcvirtalloc( GetMarkingTableByteSize() ); - // uint64* hBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() ); - // uint64* rBitField = bbcvirtalloc( GetMarkingTableBitFieldSize() ); - // CudaErrCheck( cudaMemcpyAsync( hByteField, devLMarks, GetMarkingTableByteSize(), cudaMemcpyDeviceToHost, cx.computeStream ) ); - // CudaErrCheck( cudaMemcpyAsync( hBitField, bitfield, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) ); - - // if( rTable < TableId::Table7 ) - // CudaErrCheck( cudaMemcpyAsync( rBitField, p2.devRMarks, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) ); - - // CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); - // // (void)p2.outMarks.GetDeviceBuffer(); uint64* hBitField = cx.hostMarkingTables[(int)lTable]; std::atomic bitfieldPrunedEntryCount = 0; @@ -276,23 +266,10 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 ) uint64 localPrunedEntryCount = 0; uint64 rPrunedEntryCount = 0; - // BitField rMarks( rBitField, rEntryCount ); - // const byte* bytefield = hByteField; uint64 count, offset, end; - // // Count r entries again to make sure it's still valid - // if( rt < TableId::Table7 ) - // { - // GetThreadOffsets( self, rEntryCount, count, offset, end ); - // for( uint64 i = offset; i < end; i++ ) - // { - // if( rMarks.Get( i ) ) - // rPrunedEntryCount ++; - // } - - // rTablePrunedEntryCount += rPrunedEntryCount; - // } + GetThreadOffsets( self, lEntryCount, count, offset, end ); // for( uint64 i = offset; i < end; i++ ) diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu index 8fcdfe2a..176f344c 100644 --- a/cuda/CudaPlotPhase3.cu +++ b/cuda/CudaPlotPhase3.cu @@ -374,12 +374,14 @@ void Step1( CudaK32PlotContext& cx ) const TableId rTable = cx.table; // Clear pruned table count - CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 12) +CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); // Load marking table (must be loaded before first bucket, on the same stream) if( cx.table < TableId::Table7 ) { - CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable], + Log::Line( "Marker Set to %d", 13) +CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable], GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) ); } @@ -416,7 +418,8 @@ void Step1( CudaK32PlotContext& cx ) // Generate map #define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks - CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 14) +CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) ); if( cx.table < TableId::Table7 ) PruneAndWriteRMap<<>>( KERN_RMAP_ARGS ); @@ -436,7 +439,8 @@ void Step1( CudaK32PlotContext& cx ) // Download slice counts cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream(); - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( 
"Marker Set to %d", 15) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); // Wait for completion @@ -446,7 +450,8 @@ void Step1( CudaK32PlotContext& cx ) s1.pairsLIn.Reset(); s1.pairsRIn.Reset(); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 16) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); // Add-up pruned bucket counts and tables counts memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); @@ -521,7 +526,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) #endif // Load R Marking table (must be loaded before first bucket, on the same stream) - CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable], + Log::Line( "Marker Set to %d", 17) +CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable], GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) ); // Load initial bucket @@ -536,7 +542,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) uint64 tablePrunedEntryCount = 0; uint32 rTableOffset = 0; - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 18) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -559,7 +566,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx ) uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT; #if _DEBUG - CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 19) +CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) ); #endif CudaConvertInlinedXsToLinePoints<<>>( diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu index 3a7a6449..eaa03f15 100644 --- a/cuda/CudaPlotPhase3Step2.cu +++ b/cuda/CudaPlotPhase3Step2.cu @@ -148,7 +148,8 @@ static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryC const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT; // #TODO: Use upload stream? 
- CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 20) +CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) ); } CudaConvertRMapToLinePoints<<>>( Rmap2LPParams, 0 ); @@ -223,7 +224,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) if( isCompressed ) { // Copy from upload buffer to working buffer - CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 21) +CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); } else { @@ -264,7 +266,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) LoadRBucket( cx, 0 ); // Clear pruned entry count - CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); + Log::Line( "Marker Set to %d", 22) +CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) ); // Unpack the first map beforehand UnpackLBucket( cx, 0 ); @@ -275,7 +278,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) /// uint32 rTableOffset = 0; // Track the global origin index of R entry/line point - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 23) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -298,7 +302,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount ); uint32* nextLTable = s2.devLTable[nextBucket & 1]; - CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 24) +CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) ); } if( nextBucketL < BBCU_BUCKET_COUNT ) @@ -351,12 +356,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx ) // Copy slice counts & bucket count cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream(); - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 25) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) ); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 26) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ ) { @@ -481,9 +488,11 @@ void WritePark7( CudaK32PlotContext& cx ) const size_t copySize = sizeof( uint32 ) * retainedEntryCount; if( !isLastBucket ) - CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, 
copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 27) +CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) ); else - CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 28) +CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) ); } // Download parks & write to plot diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu index c8f9337b..93b7e681 100644 --- a/cuda/CudaPlotPhase3Step3.cu +++ b/cuda/CudaPlotPhase3Step3.cu @@ -61,7 +61,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable; // (upload must be loaded before first bucket, on the same stream) - CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, + Log::Line( "Marker Set to %d", 29) +CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, s3.lpIn.GetQueue()->GetStream() ) ); // Load initial bucket @@ -116,11 +117,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) cudaStream_t lpStream = cx.computeStream;//B; cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream(); - CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) ); - CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 30) +CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 31) +CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) ); // Set initial event LP stream event as set. - CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); + Log::Line( "Marker Set to %d", 32) +CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); cx.parkFence->Reset( 0 ); s3.parkBucket = 0; @@ -145,23 +149,27 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) #if _DEBUG { size_t sortRequiredSize = 0; - CudaErrCheck( cub::DeviceRadixSort::SortPairs( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) ); + Log::Line( "Marker Set to %d", 33) +CudaErrCheck( cub::DeviceRadixSort::SortPairs( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) ); ASSERT( s3.sizeTmpSort >= sortRequiredSize ); } #endif // Wait for the previous bucket's LP work to finish, so we can re-use the device buffer - CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) ); + Log::Line( "Marker Set to %d", 34) +CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) ); // #TODO: We can use 63-7 (log2(128 buckets)), which might be faster // #NOTE: I did change it and the sort failed. Investigate. 
- CudaErrCheck( cub::DeviceRadixSort::SortPairs( + Log::Line( "Marker Set to %d", 35) +CudaErrCheck( cub::DeviceRadixSort::SortPairs( s3.devSortTmpData, s3.sizeTmpSort, unsortedLinePoints, sortedLinePoints, unsortedIndices, sortedIndices, bucketEntryCount, 0, 64, sortAndMapStream ) ); - CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) ); + Log::Line( "Marker Set to %d", 36) +CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) ); s3.lpIn .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr; s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices = nullptr; @@ -190,12 +198,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET ); // Wait for sort to finish - CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) ); + Log::Line( "Marker Set to %d", 37) +CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) ); // Deltafy line points DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream ); - CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park + Log::Line( "Marker Set to %d", 38) +CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park // Compress line point parks byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream ); @@ -214,20 +224,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) if( !isLastBucket ) { // Not the last bucket, so retain entries for the next GPU compression bucket - CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) ); + Log::Line( "Marker Set to %d", 39) +CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) ); } else { // No more buckets so we have to compress this last park on the CPU - CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) ); + Log::Line( "Marker Set to %d", 40) +CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) ); hostRetainedEntries = cx.useParkContext ? 
cx.parkContext->hostRetainedLinePoints : (uint64*)( hostParksWriter + hostParkSize * parkCount ); - CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) ); + Log::Line( "Marker Set to %d", 41) +CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) ); } } - CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again + Log::Line( "Marker Set to %d", 42) +CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again // Download parks @@ -270,17 +284,20 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx ) } // Copy park overrun count - CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) ); + Log::Line( "Marker Set to %d", 43) +CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) ); // Wait for parks to complete downloading s3.parksOut.WaitForCompletion(); s3.parksOut.Reset(); // Copy map slice counts (for the next step 2) - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 44) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, downloadStream ) ); - CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); + Log::Line( "Marker Set to %d", 45) +CudaErrCheck( cudaStreamSynchronize( downloadStream ) ); memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." 
);
diff --git a/cuda/CudaPlotUtil.cu b/cuda/CudaPlotUtil.cu
index 4f7f18b3..228be9f6 100644
--- a/cuda/CudaPlotUtil.cu
+++ b/cuda/CudaPlotUtil.cu
@@ -21,7 +21,8 @@ void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_
     GenSortKey<<>>( entryCount, devKey );
+    Log::Line( "Marker Set to %d", 46 );
     if( synchronize )
         CudaErrCheck( cudaStreamSynchronize( stream ) );
 }
@@ -48,7 +49,8 @@ void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const
     SortByKey<<>>( entryCount, devKey, devInput, devOutput );
+    Log::Line( "Marker Set to %d", 47 );
     if( synchronize )
         CudaErrCheck( cudaStreamSynchronize( stream ) );
 }
 //-----------------------------------------------------------
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 80ba8b0e..64e9e94d 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -201,10 +201,14 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     CudaInit( cx );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 48 );
+    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 49 );
+    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 50 );
+    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 51 );
+    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
     cudaEventCreateWithFlags( &cx.computeEventA, cudaEventDisableTiming );
     cudaEventCreateWithFlags( &cx.computeEventB, cudaEventDisableTiming );
@@ -318,7 +322,8 @@ void CudaInit( CudaK32PlotContext& cx )
     cx.cudaDevice = (int32)cx.cfg.deviceIndex;
     cudaDeviceProp* cudaDevProps = new cudaDeviceProp{};
-    CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
+    Log::Line( "Marker Set to %d", 52 );
+    CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
     cx.cudaDevProps = cudaDevProps;
     Log::Line( "Selected cuda device %u : %s", cx.cudaDevice, cudaDevProps->name );
@@ -364,7 +369,8 @@ void CudaK32Plotter::Run( const PlotRequest& req )
     const auto& cfg = _cfg;
     // Only start profiling from here (don't profile allocations)
-    CudaErrCheck( cudaProfilerStart() );
+    Log::Line( "Marker Set to %d", 53 );
+    CudaErrCheck( cudaProfilerStart() );
     ASSERT( cx.plotWriter == nullptr );
     cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
@@ -522,7 +528,8 @@ void FpTable( CudaK32PlotContext& cx )
     }
     // Clear slice counts
-    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 54 );
+    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
     // Load initial buckets
     UploadBucketForTable( cx, 0 );
@@ -533,7 +540,8 @@ void FpTable( CudaK32PlotContext& cx )
         FpTableBucket( cx, bucket );
     }
-    CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 55 );
+    CudaErrCheck(
cudaStreamSynchronize( cx.computeStream ) ); // Copy bucket slices to host cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu index 3d06973c..d903adc2 100644 --- a/cuda/GpuDownloadStream.cu +++ b/cuda/GpuDownloadStream.cu @@ -11,7 +11,8 @@ void* GpuDownloadBuffer::GetDeviceBuffer() { const uint32 index = self->outgoingSequence % self->bufferCount; - CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); + Log::Line( "Marker Set to %d", 56) +CudaErrCheck( cudaEventSynchronize( self->events[index] ) ); return self->deviceBuffer[index]; } @@ -25,7 +26,8 @@ void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream ) self->lockSequence++; // Wait for the device buffer to be free to be used by kernels - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); + Log::Line( "Marker Set to %d", 57) +CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); return self->deviceBuffer[index]; } @@ -64,22 +66,21 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co // const void* devBuffer = self->deviceBuffer[index]; // // Signal from the work stream when it has finished doing kernel work with the device buffer - // CudaErrCheck( cudaEventRecord( self->readyEvents[index], workStream ) ); // // Ensure the work stream has completed writing data to the device buffer // cudaStream_t stream = self->queue->_stream; - // CudaErrCheck( cudaStreamWaitEvent( stream, self->readyEvents[index] ) ); // // Copy - // CudaErrCheck( cudaMemcpyAsync( hostBuffer, devBuffer, size, cudaMemcpyDeviceToHost, stream ) ); - + // // Signal that the device buffer is free to be re-used - // CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); + // Log::Line( "Marker Set to %d", 58) +CudaErrCheck( cudaEventRecord( self->events[index], stream ) ); // // Launch copy command - // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ + // Log::Line( "Marker Set to %d", 59) +CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){ // const CopyInfo& c = *reinterpret_cast( userData ); // IGpuBuffer* self = c.self; @@ -134,11 +135,13 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_ // Signal from the work stream when it has finished doing kernel work with the device buffer - CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) ); + Log::Line( "Marker Set to %d", 60) +CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) ); // From the download stream, wait for the work stream to finish cudaStream_t downloadStream = self->queue->_stream; - CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) ); + Log::Line( "Marker Set to %d", 61) +CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) ); if( self->diskBuffer ) diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu index 399a0fbf..110929a0 100644 --- a/cuda/GpuQueue.cu +++ b/cuda/GpuQueue.cu @@ -9,9 +9,12 @@ GpuQueue::GpuQueue( Kind kind ) : _kind( kind ) , _bufferReadySignal( BBCU_BUCKET_COUNT ) { - CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) ); - CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 62) +CudaErrCheck( cudaStreamCreateWithFlags( &_stream , cudaStreamNonBlocking ) ); + 
Log::Line( "Marker Set to %d", 63) +CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) ); + Log::Line( "Marker Set to %d", 64) +CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) ); _queueThread.Run( QueueThreadEntryPoint, this ); } @@ -168,14 +171,19 @@ struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, b for( int32 i = 0; i < desc.bufferCount; i++ ) { - CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); - // CudaErrCheck( cudaEventCreateWithFlags( &buf->preloadEvents[i] , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) ); - - CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) ); - CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 65) +CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i] , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 66) +CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 67) +CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i] , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 68) +CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i] , cudaEventDisableTiming ) ); + + Log::Line( "Marker Set to %d", 69) +CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent , cudaEventDisableTiming ) ); + Log::Line( "Marker Set to %d", 70) +CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) ); buf->deviceBuffer[i] = devBuffers[i]; buf->pinnedBuffer[i] = pinnedBuffers[i]; @@ -241,14 +249,17 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams. // Signal from the work stream into the callback stream that we are ready for callback - CudaErrCheck( cudaEventRecord( lockEvent, stream ) ); + Log::Line( "Marker Set to %d", 71) +CudaErrCheck( cudaEventRecord( lockEvent, stream ) ); // Wait on the callback stream until it's ready to dsitpatch - CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) ); + Log::Line( "Marker Set to %d", 72) +CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) ); // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance. 
auto* fnCpy = new std::function( std::move( func ) ); - CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){ + Log::Line( "Marker Set to %d", 73) +CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){ auto& fn = *reinterpret_cast*>( userData ); fn(); @@ -257,10 +268,12 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c }, fnCpy ) ); // Signal from the callback stream that the callback finished - CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) ); + Log::Line( "Marker Set to %d", 74) +CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) ); // Wait on work stream for the callback to complete - CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) ); + Log::Line( "Marker Set to %d", 75) +CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) ); } size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc ) diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu index 63700c9c..e075e306 100644 --- a/cuda/GpuStreams.cu +++ b/cuda/GpuStreams.cu @@ -60,26 +60,32 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t { // Copy from unpinned to pinned first // #TODO: This should be done in a different backgrund host-to-host copy stream - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) ); - CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) ); + Log::Line( "Marker Set to %d", 76) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) ); + Log::Line( "Marker Set to %d", 77) +CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) ); hostBuffer = self->pinnedBuffer[index]; } // Ensure the device buffer is ready for use - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); + Log::Line( "Marker Set to %d", 78) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); // Upload to the device buffer - CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) ); + Log::Line( "Marker Set to %d", 79) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) ); if( !isDirect ) { // Signal that the pinned buffer is ready for re-use - CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 80) +CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); } // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 81) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); } void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize ) @@ -99,7 +105,6 @@ void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, con // cpy.copy.size = copySize; // // Launch copy command - // CudaErrCheck( cudaLaunchHostFunc( self->queue->GetStream(), []( void* userData ){ // const CopyInfo& c = *reinterpret_cast( userData ); // IGpuBuffer* self = c.self; @@ -165,7 +170,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 const auto copyMode = isDirect ? 
cudaMemcpyHostToDevice : cudaMemcpyHostToHost; // Wait on device or pinned buffer to be ready (depending if a direct copy or not) - CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) ); + Log::Line( "Marker Set to %d", 82) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) ); const byte* src = (byte*)hostBuffer; byte* dst = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] ); @@ -175,7 +181,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 { const size_t size = *sizes * (size_t)elementSize; - CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) ); + Log::Line( "Marker Set to %d", 83) +CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) ); dst += size; src += srcStride; @@ -200,15 +207,19 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32 } // #TODO: This should be done in a copy stream to perform the copies in the background - CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); - CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) ); + Log::Line( "Marker Set to %d", 84) +CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) ); + Log::Line( "Marker Set to %d", 84) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) ); if( !self->diskBuffer ) - CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 85) +CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) ); } // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); + Log::Line( "Marker Set to %d", 86) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) ); } void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, @@ -219,7 +230,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB auto stream = self->queue->GetStream(); // Ensure the device buffer is ready for use - CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); + Log::Line( "Marker Set to %d", 87) +CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) ); // Perform uploads //size_t deviceCopySize = 0; @@ -230,7 +242,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB { const size_t size = *counts * (size_t)elementSize; //memcpy( dst, src, size ); - CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) ); + Log::Line( "Marker Set to %d", 88) +CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) ); //deviceCopySize += size; @@ -240,10 +253,12 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB } // Copy to device buffer - //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) ); + //Log::Line( "Marker Set to %d", 89) +CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) ); // Signal work stream that the device buffer is ready to be used - CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) ); + Log::Line( "Marker Set to %d", 90) +CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) 
); } void GpuUploadBuffer::Upload( const void* hostBuffer, const size_t size ) @@ -270,7 +285,8 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream ) const uint32 index = self->completedSequence % self->bufferCount; self->completedSequence++; - CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) ); + Log::Line( "Marker Set to %d", 91) +CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) ); return self->deviceBuffer[index]; } @@ -284,7 +300,8 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream ) const uint32 index = self->lockSequence % self->bufferCount; self->lockSequence++; - CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) ); + Log::Line( "Marker Set to %d", 92) +CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) ); } void GpuUploadBuffer::WaitForPreloadsToComplete() @@ -342,7 +359,8 @@ DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function func ) { auto* fnCpy = new std::function( std::move( func ) ); - CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { + Log::Line( "Marker Set to %d", 93) +CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) { auto& fn = *reinterpret_cast*>( userData ); fn(); diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu index 7fb7c5d0..32f665f2 100644 --- a/cuda/chacha8.cu +++ b/cuda/chacha8.cu @@ -201,8 +201,10 @@ void GenF1Cuda( CudaK32PlotContext& cx ) chacha8_ctx chacha; chacha8_keysetup( &chacha, key, 256, nullptr ); - CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) ); - CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 94) +CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 95) +CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) ); const uint32 outIndex = CudaK32PlotGetOutputIndex( cx ); @@ -218,7 +220,8 @@ void GenF1Cuda( CudaK32PlotContext& cx ) uint32* devMeta = (uint32*)cx.metaOut.LockDeviceBuffer( cx.computeStream ); #if _DEBUG - CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) ); + Log::Line( "Marker Set to %d", 96) +CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) ); #endif // Gen chacha blocks @@ -229,10 +232,12 @@ void GenF1Cuda( CudaK32PlotContext& cx ) } // Copy bucket slices to host - CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, + Log::Line( "Marker Set to %d", 97) +CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cudaMemcpyDeviceToHost, cx.computeStream ) ); - CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); + Log::Line( "Marker Set to %d", 98) +CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) ); memcpy( &cx.bucketSlices[0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT ); @@ -293,7 +298,6 @@ static void DbgValidateBucket( CudaK32PlotContext& cx, const uint32 bucket ) if( _dbgPool == nullptr ) _dbgPool = new ThreadPool( SysHost::GetLogicalCPUCount() ); - 
//CudaErrCheck( cudaStreamSynchronize( cx.downloadStream ) ); Log::Line( "Validating bucket %u", bucket ); AnonMTJob::Run( *_dbgPool, [&cx, bucket]( AnonMTJob* self ) { diff --git a/cuda/harvesting/CudaThresher.cu b/cuda/harvesting/CudaThresher.cu index b023290e..31214524 100644 --- a/cuda/harvesting/CudaThresher.cu +++ b/cuda/harvesting/CudaThresher.cu @@ -102,11 +102,11 @@ public: bool AllocateBuffers( const uint k, uint maxCompressionLevel ) override { // Only support C7 max for now - if( maxCompressionLevel > 7 ) + if( maxCompressionLevel > 16 ) return false; // #NOTE: For now we always preallocate for the maximum compression level - maxCompressionLevel = 7; + maxCompressionLevel = 16; if( _maxCompressionLevel >= maxCompressionLevel ) return true; diff --git a/src/commands/CmdSimulator.cpp b/src/commands/CmdSimulator.cpp index e1c48251..6e4c7b78 100644 --- a/src/commands/CmdSimulator.cpp +++ b/src/commands/CmdSimulator.cpp @@ -250,8 +250,8 @@ size_t CalculatePlotSizeBytes( const uint32 k, const uint32 compressionLevel ) const size_t parkSizes[] = { 0, // Table 1 is dropped - compressionLevel >= 9 ? 0 : info.tableParkSize, - compressionLevel >= 9 ? info.tableParkSize : CalculateParkSize( TableId::Table3 ), + compressionLevel >= 40 ? 0 : info.tableParkSize, + compressionLevel >= 40 ? info.tableParkSize : CalculateParkSize( TableId::Table3 ), CalculateParkSize( TableId::Table4 ), CalculateParkSize( TableId::Table5 ), CalculateParkSize( TableId::Table6 ), diff --git a/src/harvesting/GreenReaper.cpp b/src/harvesting/GreenReaper.cpp index 3aae4cdd..3036eee6 100644 --- a/src/harvesting/GreenReaper.cpp +++ b/src/harvesting/GreenReaper.cpp @@ -346,7 +346,7 @@ GRResult grGetCompressionInfo( GRCompressionInfo* outInfo, const size_t infoStru if( outInfo == nullptr || k != 32 || compressionLevel < 1 - || compressionLevel > 9 ) + || compressionLevel > 40 ) { return GRResult_InvalidArg; } @@ -385,7 +385,7 @@ GRResult grFetchProofForChallenge( GreenReaperContext* cx, GRCompressedProofRequ uint32 xGroups[GR_POST_PROOF_X_COUNT] = {}; // Unpack x groups first - if( req->compressionLevel < 9 ) + if( req->compressionLevel < 40 ) { for( uint32 i = 0, j = 0; i < numGroups; i++, j+=2 ) { @@ -514,7 +514,7 @@ GRResult grGetFetchQualitiesXPair( GreenReaperContext* cx, GRCompressedQualities proofMightBeDropped = (x1x2.x == 0 || x1x2.y == 0) || (x2x3.x == 0 || x2x3.y == 0); - if( req->compressionLevel < 9 ) + if( req->compressionLevel < 40 ) { numXGroups = 2; xGroups[0] = (uint32)x1x2.x; @@ -541,7 +541,7 @@ GRResult grGetFetchQualitiesXPair( GreenReaperContext* cx, GRCompressedQualities } } - if( req->compressionLevel >= 6 && req->compressionLevel < 9 ) + if( req->compressionLevel >= 6 && req->compressionLevel < 40 ) { const BackPtr p = LinePointToSquare( ((uint128)req->xLinePoints[1].hi) << 64 | (uint128)req->xLinePoints[1].lo ); @@ -550,7 +550,7 @@ GRResult grGetFetchQualitiesXPair( GreenReaperContext* cx, GRCompressedQualities proofMightBeDropped = proofMightBeDropped || (x1x2.x == 0 || x1x2.y == 0) || (x2x3.x == 0 || x2x3.y == 0); - if( req->compressionLevel < 9 ) + if( req->compressionLevel < 40 ) { numXGroups = 4; xGroups[4] = (uint32)x1x2.x; @@ -894,7 +894,7 @@ void BacktraceProof( GreenReaperContext& cx, const TableId tableStart, uint64 pr //----------------------------------------------------------- GRResult RequestSetup( GreenReaperContext* cx, const uint32 k, const uint32 compressionLevel ) { - if( compressionLevel < 1 || compressionLevel > 9 ) + if( compressionLevel < 1 || compressionLevel > 40 ) 
return GRResult_Failed;
     // Make sure we have our CUDA decompressor working in case it was deleted after a failure
diff --git a/src/harvesting/GreenReaperInternal.h b/src/harvesting/GreenReaperInternal.h
index 6ae101d0..ae4c1d02 100644
--- a/src/harvesting/GreenReaperInternal.h
+++ b/src/harvesting/GreenReaperInternal.h
@@ -26,6 +26,6 @@ inline uint64 GetEntriesPerBucketForCompressionLevel( const uint32 k, const uint
 inline uint64 GetMaxTablePairsForCompressionLevel( const uint32 k, const uint32 cLevel )
 {
-    const double factor = cLevel >= 9 ? GR_MAX_MATCHES_MULTIPLIER_2T_DROP : GR_MAX_MATCHES_MULTIPLIER;
+    const double factor = cLevel >= 40 ? GR_MAX_MATCHES_MULTIPLIER_2T_DROP : GR_MAX_MATCHES_MULTIPLIER;
     return (uint64)( GetEntriesPerBucketForCompressionLevel( k, cLevel ) * factor ) * (uint64)GR_MAX_BUCKETS;
 }
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index c510ed5c..1bda9d84 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -273,7 +273,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     }
     else if( cli.ArgConsume( "ramplot" ) )
     {
-        FatalIf( cfg.compressionLevel > 7, "ramplot currently does not support compression levels greater than 7" );
+        FatalIf( cfg.compressionLevel > 16, "ramplot currently does not support compression levels greater than 16" );
         plotter = new MemPlotter();
         break;
@@ -390,7 +390,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     // FatalIf( cfg.compressionLevel > 7, "Invalid compression level. Please specify a compression level between 0 and 7 (inclusive)." );
-    FatalIf( cfg.compressionLevel > 9, "Invalid compression level. Please specify a compression level between 0 and 9 (inclusive)." );
+    FatalIf( cfg.compressionLevel > 40, "Invalid compression level. Please specify a compression level between 0 and 40 (inclusive)." );
     // If making compressed plots, get thr compression CTable, etc.
     if( cfg.compressionLevel > 0 )
     {
@@ -402,7 +402,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
         cfg.ctable = CreateCompressionCTable( cfg.compressionLevel, &cfg.cTableSize );
         cfg.compressionInfo = GetCompressionInfoForLevel( cfg.compressionLevel );
         cfg.compressedEntryBits = cfg.compressionInfo.entrySizeBits;
-        cfg.numDroppedTables = cfg.compressionLevel < 9 ? 1 : 2;
+        cfg.numDroppedTables = cfg.compressionLevel < 40 ?
1 : 2; cfg.ctable = CreateCompressionCTable( cfg.compressionLevel ); cfg.compressionInfo = GetCompressionInfoForLevel( cfg.compressionLevel ); diff --git a/src/plotting/Compression.cpp b/src/plotting/Compression.cpp index bde4313b..50f84446 100644 --- a/src/plotting/Compression.cpp +++ b/src/plotting/Compression.cpp @@ -83,7 +83,14 @@ CompressionInfo GetCompressionInfoForLevel( const uint32 compressionLevel ) case 7: GetCompressionInfoForLevel<7>( info ); break; case 8: GetCompressionInfoForLevel<8>( info ); break; case 9: GetCompressionInfoForLevel<9>( info ); break; - + case 10: GetCompressionInfoForLevel<10>( info ); break; + case 11: GetCompressionInfoForLevel<11>( info ); break; + case 12: GetCompressionInfoForLevel<12>( info ); break; + case 13: GetCompressionInfoForLevel<13>( info ); break; + case 14: GetCompressionInfoForLevel<14>( info ); break; + case 15: GetCompressionInfoForLevel<15>( info ); break; + case 16: GetCompressionInfoForLevel<16>( info ); break; + default: Fatal( "Invalid compression level %u.", compressionLevel ); break; @@ -96,6 +103,7 @@ void* CreateCompressionTable( const uint32 compressionLevel, size_t* outTableSiz { switch ( compressionLevel ) { + case 1: return CreateCompressionCTable<1>( outTableSize, compress ); case 2: return CreateCompressionCTable<2>( outTableSize, compress ); case 3: return CreateCompressionCTable<3>( outTableSize, compress ); @@ -105,6 +113,13 @@ void* CreateCompressionTable( const uint32 compressionLevel, size_t* outTableSiz case 7: return CreateCompressionCTable<7>( outTableSize, compress ); case 8: return CreateCompressionCTable<8>( outTableSize, compress ); case 9: return CreateCompressionCTable<9>( outTableSize, compress ); + case 10: return CreateCompressionCTable<10>( outTableSize, compress ); + case 11: return CreateCompressionCTable<11>( outTableSize, compress ); + case 12: return CreateCompressionCTable<12>( outTableSize, compress ); + case 13: return CreateCompressionCTable<13>( outTableSize, compress ); + case 14: return CreateCompressionCTable<14>( outTableSize, compress ); + case 15: return CreateCompressionCTable<15>( outTableSize, compress ); + case 16: return CreateCompressionCTable<16>( outTableSize, compress ); default: break; @@ -146,14 +161,22 @@ uint32 GetCompressedLPBitCount( const uint32 compressionLevel ) size_t GetLargestCompressedParkSize() { return std::max( { - GetCompressionInfoForLevel( 1 ).tableParkSize, - GetCompressionInfoForLevel( 2 ).tableParkSize, - GetCompressionInfoForLevel( 3 ).tableParkSize, - GetCompressionInfoForLevel( 4 ).tableParkSize, - GetCompressionInfoForLevel( 5 ).tableParkSize, - GetCompressionInfoForLevel( 6 ).tableParkSize, - GetCompressionInfoForLevel( 7 ).tableParkSize, - GetCompressionInfoForLevel( 8 ).tableParkSize, - GetCompressionInfoForLevel( 9 ).tableParkSize } + GetCompressionInfoForLevel( 1 ).tableParkSize, + GetCompressionInfoForLevel( 2 ).tableParkSize, + GetCompressionInfoForLevel( 3 ).tableParkSize, + GetCompressionInfoForLevel( 4 ).tableParkSize, + GetCompressionInfoForLevel( 5 ).tableParkSize, + GetCompressionInfoForLevel( 6 ).tableParkSize, + GetCompressionInfoForLevel( 7 ).tableParkSize, + GetCompressionInfoForLevel( 8 ).tableParkSize, + GetCompressionInfoForLevel( 9 ).tableParkSize, + GetCompressionInfoForLevel( 10 ).tableParkSize, + GetCompressionInfoForLevel( 11 ).tableParkSize, + GetCompressionInfoForLevel( 12 ).tableParkSize, + GetCompressionInfoForLevel( 13 ).tableParkSize, + GetCompressionInfoForLevel( 14 ).tableParkSize, + GetCompressionInfoForLevel( 15 
).tableParkSize, + GetCompressionInfoForLevel( 16 ).tableParkSize + } ); } \ No newline at end of file diff --git a/src/plotting/Compression.h b/src/plotting/Compression.h index dbb01228..7a926a70 100644 --- a/src/plotting/Compression.h +++ b/src/plotting/Compression.h @@ -106,7 +106,65 @@ struct CompressionLevelInfo<9> static constexpr double ANS_R_VALUE = 4.54; }; +template<> +struct CompressionLevelInfo<10> +{ + static constexpr uint32_t ENTRY_SIZE = 7; + static constexpr uint32_t STUB_BIT_SIZE = 26; + static constexpr size_t TABLE_PARK_SIZE = 7896; + static constexpr double ANS_R_VALUE = 4.54; +}; +template<> +struct CompressionLevelInfo<11> +{ + static constexpr uint32_t ENTRY_SIZE = 6; + static constexpr uint32_t STUB_BIT_SIZE = 22; + static constexpr size_t TABLE_PARK_SIZE = 6930; + static constexpr double ANS_R_VALUE = 4.54; +}; +template<> +struct CompressionLevelInfo<12> +{ + static constexpr uint32_t ENTRY_SIZE = 5; + static constexpr uint32_t STUB_BIT_SIZE = 18; + static constexpr size_t TABLE_PARK_SIZE = 5953; + static constexpr double ANS_R_VALUE = 4.54; +}; +template<> +struct CompressionLevelInfo<13> +{ + static constexpr uint32_t ENTRY_SIZE = 4; + static constexpr uint32_t STUB_BIT_SIZE = 14; + static constexpr size_t TABLE_PARK_SIZE = 4956; + static constexpr double ANS_R_VALUE = 4.54; +}; + +template<> +struct CompressionLevelInfo<14> +{ + static constexpr uint32_t ENTRY_SIZE = 3; + static constexpr uint32_t STUB_BIT_SIZE = 10; + static constexpr size_t TABLE_PARK_SIZE = 3944; + static constexpr double ANS_R_VALUE = 4.54; +}; + +template<> +struct CompressionLevelInfo<15> +{ + static constexpr uint32_t ENTRY_SIZE = 2; + static constexpr uint32_t STUB_BIT_SIZE = 6; + static constexpr size_t TABLE_PARK_SIZE = 2930; + static constexpr double ANS_R_VALUE = 4.54; +}; +template<> +struct CompressionLevelInfo<16> +{ + static constexpr uint32_t ENTRY_SIZE = 1; + static constexpr uint32_t STUB_BIT_SIZE = 2; + static constexpr size_t TABLE_PARK_SIZE = 1913; + static constexpr double ANS_R_VALUE = 4.54; +}; \ No newline at end of file diff --git a/src/plotting/PlotWriter.cpp b/src/plotting/PlotWriter.cpp index 3d3440c7..307b1a23 100644 --- a/src/plotting/PlotWriter.cpp +++ b/src/plotting/PlotWriter.cpp @@ -95,7 +95,7 @@ bool PlotWriter::BeginPlotInternal( PlotVersion version, if( !plotMemo || !plotMemoSize ) return false; - ASSERT( compressionLevel >= 0 && compressionLevel <= 9 ); + ASSERT( compressionLevel >= 0 && compressionLevel <= 40 ); if( compressionLevel > 0 && version < PlotVersion::v2_0 ) return false; diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp index f275d980..18728835 100644 --- a/src/tools/PlotComparer.cpp +++ b/src/tools/PlotComparer.cpp @@ -376,14 +376,14 @@ void TestTable( FilePlot& ref, FilePlot& tgt, TableId table ) if( table == TableId::Table1 && tgt.CompressionLevel() > 0 ) return; - if( table == TableId::Table2 && tgt.CompressionLevel() >= 9 ) + if( table == TableId::Table2 && tgt.CompressionLevel() >= 40 ) return; // if( table == TableId::Table7 ) return; Log::Line( "Reading Table %u...", table+1 ); - const uint32 numTablesDropped = tgt.CompressionLevel() >= 9 ? 2 : + const uint32 numTablesDropped = tgt.CompressionLevel() >= 40 ? 2 : tgt.CompressionLevel() >= 1 ? 1 : 0; const size_t parkSize = table < TableId::Table7 ? 
diff --git a/src/tools/PlotReader.cpp b/src/tools/PlotReader.cpp index 0059f186..6aad1ec2 100644 --- a/src/tools/PlotReader.cpp +++ b/src/tools/PlotReader.cpp @@ -439,7 +439,7 @@ TableId PlotReader::GetLowestStoredTable() const const uint32 compressionLevel = _plot.CompressionLevel(); const uint32 numDroppedTables = compressionLevel == 0 ? 0 : - compressionLevel >= 9 ? 2 : 1; + compressionLevel >= 40 ? 2 : 1; return TableId::Table1 + numDroppedTables; } @@ -732,7 +732,7 @@ ProofFetchResult PlotReader::DecompressProof( const uint64 compressedProof[BB_PL req.plotId = _plot.PlotId(); req.compressionLevel = compressionLevel; - const uint32 compressedProofCount = compressionLevel < 9 ? PROOF_X_COUNT / 2 : PROOF_X_COUNT / 4; + const uint32 compressedProofCount = compressionLevel < 40 ? PROOF_X_COUNT / 2 : PROOF_X_COUNT / 4; for( uint32 i = 0; i < compressedProofCount; i++ ) req.compressedProof[i] = compressedProof[i]; diff --git a/src/tools/PlotValidator.cpp b/src/tools/PlotValidator.cpp index 16a7f14a..c1dad4fa 100644 --- a/src/tools/PlotValidator.cpp +++ b/src/tools/PlotValidator.cpp @@ -1137,7 +1137,7 @@ bool DecompressProof( const byte plotId[BB_PLOT_ID_LEN], const uint32 compressio req.compressionLevel = compressionLevel; req.plotId = plotId; - const uint32 compressedProofCount = compressionLevel < 9 ? PROOF_X_COUNT / 2 : PROOF_X_COUNT / 4; + const uint32 compressedProofCount = compressionLevel < 40 ? PROOF_X_COUNT / 2 : PROOF_X_COUNT / 4; for( uint32 i = 0; i < compressedProofCount; i++ ) req.compressedProof[i] = compressedProof[i]; @@ -1170,7 +1170,7 @@ bool FetchProof( PlotReader& plot, uint64 t6LPIndex, uint64 fullProofXs[PROOF_X_ const bool isCompressed = plot.PlotFile().CompressionLevel() > 0; const TableId endTable = !isCompressed ? TableId::Table1 : - plot.PlotFile().CompressionLevel() < 9 ? + plot.PlotFile().CompressionLevel() < 40 ? TableId::Table2 : TableId::Table3; for( TableId table = TableId::Table6; table >= endTable; table-- )
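The level-gating rule this patch applies consistently across GreenReaper.cpp, main.cpp, PlotReader.cpp, and PlotValidator.cpp is: compression levels 1-39 drop only Table 1 and store compressed proofs as x-pairs (half of PROOF_X_COUNT), while levels 40 and above drop Tables 1 and 2 and store x-quads (a quarter of PROOF_X_COUNT). The sketch below is not part of the patch; it only restates that rule as standalone helpers, and it assumes PROOF_X_COUNT is 64 as for standard k=32 proofs.

    #include <cstdint>
    #include <cstdio>

    // Assumed constant for illustration: a full k=32 proof carries 64 x-values.
    constexpr uint32_t PROOF_X_COUNT = 64;

    // Levels 1-39 drop one table (Table 1); level 40+ drops two (Tables 1 and 2).
    static uint32_t NumDroppedTables( const uint32_t compressionLevel )
    {
        if( compressionLevel == 0 ) return 0;
        return compressionLevel >= 40 ? 2 : 1;
    }

    // With one dropped table the plot stores x-pairs; with two it stores x-quads.
    static uint32_t CompressedProofCount( const uint32_t compressionLevel )
    {
        return compressionLevel < 40 ? PROOF_X_COUNT / 2 : PROOF_X_COUNT / 4;
    }

    int main()
    {
        for( const uint32_t level : { 0u, 7u, 16u, 40u } )
            printf( "C%u -> dropped tables: %u, compressed proof xs: %u\n",
                    level, NumDroppedTables( level ), CompressedProofCount( level ) );
    }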