
Commit

add marker
Moz committed Nov 2, 2023
1 parent 2f3a7d3 commit c7cb73c
Showing 13 changed files with 199 additions and 100 deletions.
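Every hunk in this commit follows the same pattern: a sequentially numbered Log::Line( "Marker Set to %d", n ) call is inserted directly before a CUDA call, so the last marker printed before a hang or a failed CudaErrCheck narrows down which call is at fault. Below is a minimal sketch of the same idea folded into a single statement; it assumes this repository's Log::Line and CudaErrCheck helpers, and the MARKED_CUDA_CHECK macro itself is hypothetical, not part of this commit.

// Hypothetical helper (not in this commit): logs the marker and runs the
// checked CUDA call as one statement, which also keeps it safe inside an
// unbraced if/else branch.
#define MARKED_CUDA_CHECK( marker, call )           \
    do {                                            \
        Log::Line( "Marker Set to %d", (marker) );  \
        CudaErrCheck( call );                       \
    } while( 0 )

// Usage, equivalent to the first hunk below:
// MARKED_CUDA_CHECK( 1, cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );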
6 changes: 4 additions & 2 deletions cuda/CudaMatch.cu
@@ -592,8 +592,10 @@ void CudaMatchBucketizedK32(
CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
}

CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 1)
CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 2)
CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );

byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
6 changes: 4 additions & 2 deletions cuda/CudaParkSerializer.cu
@@ -20,8 +20,10 @@ void InitFSEBitMask( CudaK32PlotContext& cx )
0x3FFFFFFF, 0x7FFFFFFF
};

CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 3)
CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 4)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
}


6 changes: 4 additions & 2 deletions cuda/CudaPlotContext.h
@@ -576,8 +576,10 @@ inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t s
byte hash[32];

void* hostBuffer = bbvirtallocboundednuma<byte>( size );
CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
CudaErrCheck( cudaStreamSynchronize( stream ) );
Log::Line( "Marker Set to %d", 5)
CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
Log::Line( "Marker Set to %d", 6)
CudaErrCheck( cudaStreamSynchronize( stream ) );

blake3_hasher hasher;
blake3_hasher_init( &hasher );
15 changes: 10 additions & 5 deletions cuda/CudaPlotPhase2.cu
@@ -119,7 +119,8 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,

#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 7)
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
#else
#define G_PRUNED_COUNTS
#endif
@@ -176,7 +177,8 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
}

// Zero-out marks
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
Log::Line( "Marker Set to %d", 8)
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );

// Load first bucket's worth of pairs
LoadPairs( cx, p2, rTable, 0 );
@@ -230,9 +232,12 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
{
uint32 prunedEntryCount = 0;
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 9)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 10)
CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
Log::Line( "Marker Set to %d", 11)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );

const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1,
24 changes: 16 additions & 8 deletions cuda/CudaPlotPhase3.cu
@@ -374,12 +374,14 @@ void Step1( CudaK32PlotContext& cx )
const TableId rTable = cx.table;

// Clear pruned table count
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
Log::Line( "Marker Set to %d", 12)
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );

// Load marking table (must be loaded before first bucket, on the same stream)
if( cx.table < TableId::Table7 )
{
CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
Log::Line( "Marker Set to %d", 13)
CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) );
}

@@ -416,7 +418,8 @@ void Step1( CudaK32PlotContext& cx )
// Generate map
#define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks

CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 14)
CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );

if( cx.table < TableId::Table7 )
PruneAndWriteRMap<true><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
@@ -436,7 +439,8 @@ void Step1( CudaK32PlotContext& cx )
// Download slice counts
cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();

CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
Log::Line( "Marker Set to %d", 15)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

// Wait for completion
@@ -446,7 +450,8 @@ void Step1( CudaK32PlotContext& cx )
s1.pairsLIn.Reset();
s1.pairsRIn.Reset();

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
Log::Line( "Marker Set to %d", 16)
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );

// Add-up pruned bucket counts and tables counts
memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
@@ -521,7 +526,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
#endif

// Load R Marking table (must be loaded before first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
Log::Line( "Marker Set to %d", 17)
CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) );

// Load initial bucket
@@ -536,7 +542,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
uint64 tablePrunedEntryCount = 0;
uint32 rTableOffset = 0;

CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 18)
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
@@ -559,7 +566,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;

#if _DEBUG
CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
Log::Line( "Marker Set to %d", 19)
CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
#endif

CudaConvertInlinedXsToLinePoints<<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>(
27 changes: 18 additions & 9 deletions cuda/CudaPlotPhase3Step2.cu
@@ -148,7 +148,8 @@ static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryC
const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT;

// #TODO: Use upload stream?
CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 20)
CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
}

CudaConvertRMapToLinePoints<false><<<blocks, threads, 0, stream>>>( Rmap2LPParams, 0 );
@@ -223,7 +224,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
if( isCompressed )
{
// Copy from upload buffer to working buffer
CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 21)
CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
}
else
{
@@ -264,7 +266,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
LoadRBucket( cx, 0 );

// Clear pruned entry count
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
Log::Line( "Marker Set to %d", 22)
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );

// Unpack the first map beforehand
UnpackLBucket( cx, 0 );
@@ -275,7 +278,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
///
uint32 rTableOffset = 0; // Track the global origin index of R entry/line point

CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 23)
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
@@ -298,7 +302,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount );

uint32* nextLTable = s2.devLTable[nextBucket & 1];
CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 24)
CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
}

if( nextBucketL < BBCU_BUCKET_COUNT )
@@ -351,12 +356,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
// Copy slice counts & bucket count
cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream();

CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
Log::Line( "Marker Set to %d", 25)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
Log::Line( "Marker Set to %d", 26)
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
@@ -481,9 +488,11 @@ void WritePark7( CudaK32PlotContext& cx )
const size_t copySize = sizeof( uint32 ) * retainedEntryCount;

if( !isLastBucket )
CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 27)
CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
else
CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
Log::Line( "Marker Set to %d", 28)
CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
}

// Download parks & write to plot
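One detail in the WritePark7 hunk above: the markers there land in single-statement if/else branches. In C++ only the first statement after an unbraced if or else belongs to that branch, so once a marker is added the branch needs explicit braces, otherwise the original call falls outside it and the else is left dangling. A small self-contained sketch of the braced shape, using hypothetical stand-ins (LogMarker, RetainOnDevice, DownloadToHost) for the calls involved:

#include <cstdio>

// Hypothetical stand-ins for Log::Line and the two cudaMemcpyAsync calls.
static void LogMarker( int m ) { std::printf( "Marker Set to %d\n", m ); }
static void RetainOnDevice()   {}
static void DownloadToHost()   {}

static void CopyRetainedEntries( bool isLastBucket )
{
    // Braces keep the marker and the original copy inside the same branch.
    if( !isLastBucket )
    {
        LogMarker( 27 );
        RetainOnDevice();
    }
    else
    {
        LogMarker( 28 );
        DownloadToHost();
    }
}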
51 changes: 34 additions & 17 deletions cuda/CudaPlotPhase3Step3.cu
@@ -61,7 +61,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;

// (upload must be loaded before first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
Log::Line( "Marker Set to %d", 29)
CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
s3.lpIn.GetQueue()->GetStream() ) );

// Load initial bucket
@@ -116,11 +117,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
cudaStream_t lpStream = cx.computeStream;//B;
cudaStream_t downloadStream = cx.gpuDownloadStream[0]->GetStream();

CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );
Log::Line( "Marker Set to %d", 30)
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
Log::Line( "Marker Set to %d", 31)
CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );

// Set initial event LP stream event as set.
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
Log::Line( "Marker Set to %d", 32)
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );

cx.parkFence->Reset( 0 );
s3.parkBucket = 0;
@@ -145,23 +149,27 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
#if _DEBUG
{
size_t sortRequiredSize = 0;
CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
Log::Line( "Marker Set to %d", 33)
CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
ASSERT( s3.sizeTmpSort >= sortRequiredSize );
}
#endif

// Wait for the previous bucket's LP work to finish, so we can re-use the device buffer
CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );
Log::Line( "Marker Set to %d", 34)
CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );

// #TODO: We can use 63-7 (log2(128 buckets)), which might be faster
// #NOTE: I did change it and the sort failed. Investigate.
CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>(
Log::Line( "Marker Set to %d", 35)
CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>(
s3.devSortTmpData, s3.sizeTmpSort,
unsortedLinePoints, sortedLinePoints,
unsortedIndices, sortedIndices,
bucketEntryCount, 0, 64, sortAndMapStream ) );

CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );
Log::Line( "Marker Set to %d", 36)
CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );

s3.lpIn .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr;
s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices = nullptr;
@@ -190,12 +198,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET );

// Wait for sort to finish
CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );
Log::Line( "Marker Set to %d", 37)
CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );

// Deltafy line points
DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream );

CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park
Log::Line( "Marker Set to %d", 38)
CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) ); // Signal download stream can download remaining line points for last park

// Compress line point parks
byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream );
@@ -214,20 +224,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
if( !isLastBucket )
{
// Not the last bucket, so retain entries for the next GPU compression bucket
CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
Log::Line( "Marker Set to %d", 39)
CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
}
else
{
// No more buckets so we have to compress this last park on the CPU
CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
Log::Line( "Marker Set to %d", 40)
CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );

hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints :
(uint64*)( hostParksWriter + hostParkSize * parkCount );
CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
Log::Line( "Marker Set to %d", 41)
CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
}
}

CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again
Log::Line( "Marker Set to %d", 42)
CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) ); // Signal sortedLinePoints buffer ready for use again


// Download parks
@@ -270,17 +284,20 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
}

// Copy park overrun count
CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );
Log::Line( "Marker Set to %d", 43)
CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );

// Wait for parks to complete downloading
s3.parksOut.WaitForCompletion();
s3.parksOut.Reset();

// Copy map slice counts (for the next step 2)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
Log::Line( "Marker Set to %d", 44)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
Log::Line( "Marker Set to %d", 45)
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );

FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." );
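A recurring shape across these hunks is the download sequence: counters are accumulated on the device, copied back with cudaMemcpyAsync on a stream, and only read on the host after cudaStreamSynchronize on that same stream. A standalone sketch of that sequence against the standard CUDA runtime API, with made-up buffer names and error checking omitted:

#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    cudaStream_t stream;
    cudaStreamCreate( &stream );

    // Device-side counter, cleared asynchronously on the stream.
    unsigned int* devCount = nullptr;
    cudaMalloc( (void**)&devCount, sizeof( unsigned int ) );
    cudaMemsetAsync( devCount, 0, sizeof( unsigned int ), stream );

    // ... kernels that update devCount would be launched on `stream` here ...

    // The copy is only enqueued; hostCount is not valid until the stream drains.
    unsigned int hostCount = 0;
    cudaMemcpyAsync( &hostCount, devCount, sizeof( unsigned int ),
                     cudaMemcpyDeviceToHost, stream );
    cudaStreamSynchronize( stream );

    std::printf( "count = %u\n", hostCount );

    cudaFree( devCount );
    cudaStreamDestroy( stream );
    return 0;
}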
