Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

try upgrade max compression to 40 #436

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 4 additions & 59 deletions cuda/CudaMatch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -564,17 +564,6 @@ cudaError CudaHarvestMatchK32(
HarvestMatchK32Kernel<<<kblocks, kthreads, 0, stream>>>(
devOutPairs, devMatchCount, devYEntries, entryCount, matchOffset );

// #if _DEBUG
// uint32 matchCount = 0;
// CudaErrCheck( cudaMemcpyAsync( &matchCount, devMatchCount, sizeof( uint32 ) , cudaMemcpyDeviceToHost, stream ) );
// CudaErrCheck( cudaStreamSynchronize( stream ) );
// CudaErrCheck( cudaStreamSynchronize( stream ) );

// Pair* matches = new Pair[matchCount];
// CudaErrCheck( cudaMemcpyAsync( matches, devOutPairs, sizeof( Pair ) * matchCount , cudaMemcpyDeviceToHost, stream ) );
// CudaErrCheck( cudaStreamSynchronize( stream ) );
// CudaErrCheck( cudaStreamSynchronize( stream ) );
// #endif

return cudaSuccess;
}
Expand Down Expand Up @@ -603,8 +592,10 @@ void CudaMatchBucketizedK32(
CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
}

CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 1)
CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 2)
CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );

byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
Expand All @@ -621,49 +612,3 @@ void CudaMatchBucketizedK32(
MatchCudaK32Bucket<<<BBCU_MAX_GROUP_COUNT, BBCU_THREADS_PER_MATCH_GROUP, 0, stream>>>( bucketMask, entryCount, cx.devGroupCount, devY, cx.devGroupBoundaries, cx.devMatchCount, cx.devMatches );
}

//-----------------------------------------------------------
// cudaError CudaHarvestMatchK32WithGroupScan(
// Pair* devOutPairs,
// uint32* devMatchCount,
// const uint32 maxMatches,
// uint32* devGroupIndices,
// uint32* devGroupIndicesTemp,
// const uint32 maxGroups,
// void* sortBuffer,
// const size_t sortBufferSize,
// const uint64* devYEntries,
// const uint32 entryCount,
// const uint32 matchOffset,
// cudaStream_t stream )
// {
// // Scan for BC groups
// {
// const uint32 kblocks = 0;
// const uint32 kthreads = 0;


// // constexpr uint32 kscanblocks = CuCDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, BBCU_SCAN_GROUP_THREADS );
// // Initialize the entries to the max value so that they are not included in the sort
// CudaInitGroups<<<kblocks, kthreads, 0, stream>>>( devGroupIndicesTemp, entryCount );
// // CudaInitGroupsBucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( tmpGroupCounts );

// // Add first group and last ghost group
// CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
// }

// CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
// CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
// ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );

// byte* sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
// size_t sortTmpSize = ( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_MAX_GROUP_COUNT ) * sizeof( uint32 );

// #if _DEBUG
// size_t sortSize = 0;
// cub::DeviceRadixSort::SortKeys<uint32, uint32>( nullptr, sortSize, nullptr, nullptr, BBCU_MAX_GROUP_COUNT, 0, 32 );
// ASSERT( sortSize <= sortTmpSize );
// #endif

// cub::DeviceRadixSort::SortKeys<uint32, uint32>( sortTmpAlloc, sortTmpSize, tmpGroupCounts, cx.devGroupBoundaries, BBCU_MAX_GROUP_COUNT, 0, 32, stream );

// }
6 changes: 4 additions & 2 deletions cuda/CudaParkSerializer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ void InitFSEBitMask( CudaK32PlotContext& cx )
0x3FFFFFFF, 0x7FFFFFFF
};

CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 3)
CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 4)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
}


Expand Down
6 changes: 4 additions & 2 deletions cuda/CudaPlotContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -576,8 +576,10 @@ inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t s
byte hash[32];

void* hostBuffer = bbvirtallocboundednuma<byte>( size );
CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
CudaErrCheck( cudaStreamSynchronize( stream ) );
Log::Line( "Marker Set to %d", 5)
CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
Log::Line( "Marker Set to %d", 6)
CudaErrCheck( cudaStreamSynchronize( stream ) );

blake3_hasher hasher;
blake3_hasher_init( &hasher );
Expand Down
45 changes: 11 additions & 34 deletions cuda/CudaPlotPhase2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,

#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
#define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
Log::Line( "Marker Set to %d", 7)
CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
#else
#define G_PRUNED_COUNTS
#endif
Expand Down Expand Up @@ -176,7 +177,8 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
}

// Zero-out marks
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
Log::Line( "Marker Set to %d", 8)
CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );

// Load first bucket's worth of pairs
LoadPairs( cx, p2, rTable, 0 );
Expand Down Expand Up @@ -230,9 +232,12 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
#if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
{
uint32 prunedEntryCount = 0;
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 9)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
Log::Line( "Marker Set to %d", 10)
CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
Log::Line( "Marker Set to %d", 11)
CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );

const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
Log::Line( "Table %u now has %u / %llu ( %.2lf%% ) entries.", (uint)lTable+1,
Expand All @@ -245,22 +250,7 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
#if _DEBUG
p2.outMarks.WaitForCompletion();

// CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
// CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[0]->GetStream() ) );
// CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[1]->GetStream() ) );
// CudaErrCheck( cudaStreamSynchronize( cx.gpuDownloadStream[2]->GetStream() ) );

// byte* hByteField = bbcvirtalloc<byte>( GetMarkingTableByteSize() );
// uint64* hBitField = bbcvirtalloc<uint64>( GetMarkingTableBitFieldSize() );
// uint64* rBitField = bbcvirtalloc<uint64>( GetMarkingTableBitFieldSize() );
// CudaErrCheck( cudaMemcpyAsync( hByteField, devLMarks, GetMarkingTableByteSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );
// CudaErrCheck( cudaMemcpyAsync( hBitField, bitfield, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );

// if( rTable < TableId::Table7 )
// CudaErrCheck( cudaMemcpyAsync( rBitField, p2.devRMarks, GetMarkingTableBitFieldSize(), cudaMemcpyDeviceToHost, cx.computeStream ) );

// CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
// // (void)p2.outMarks.GetDeviceBuffer();
uint64* hBitField = cx.hostMarkingTables[(int)lTable];

std::atomic<uint64> bitfieldPrunedEntryCount = 0;
Expand All @@ -276,23 +266,10 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
uint64 localPrunedEntryCount = 0;
uint64 rPrunedEntryCount = 0;

// BitField rMarks( rBitField, rEntryCount );
// const byte* bytefield = hByteField;

uint64 count, offset, end;

// // Count r entries again to make sure it's still valid
// if( rt < TableId::Table7 )
// {
// GetThreadOffsets( self, rEntryCount, count, offset, end );
// for( uint64 i = offset; i < end; i++ )
// {
// if( rMarks.Get( i ) )
// rPrunedEntryCount ++;
// }

// rTablePrunedEntryCount += rPrunedEntryCount;
// }


GetThreadOffsets( self, lEntryCount, count, offset, end );
// for( uint64 i = offset; i < end; i++ )
Expand Down
24 changes: 16 additions & 8 deletions cuda/CudaPlotPhase3.cu
Original file line number Diff line number Diff line change
Expand Up @@ -374,12 +374,14 @@ void Step1( CudaK32PlotContext& cx )
const TableId rTable = cx.table;

// Clear pruned table count
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
Log::Line( "Marker Set to %d", 12)
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );

// Load marking table (must be loaded before first bucket, on the same stream)
if( cx.table < TableId::Table7 )
{
CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
Log::Line( "Marker Set to %d", 13)
CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) );
}

Expand Down Expand Up @@ -416,7 +418,8 @@ void Step1( CudaK32PlotContext& cx )
// Generate map
#define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks

CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 14)
CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );

if( cx.table < TableId::Table7 )
PruneAndWriteRMap<true><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
Expand All @@ -436,7 +439,8 @@ void Step1( CudaK32PlotContext& cx )
// Download slice counts
cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();

CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
Log::Line( "Marker Set to %d", 15)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

// Wait for completion
Expand All @@ -446,7 +450,8 @@ void Step1( CudaK32PlotContext& cx )
s1.pairsLIn.Reset();
s1.pairsRIn.Reset();

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
Log::Line( "Marker Set to %d", 16)
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );

// Add-up pruned bucket counts and tables counts
memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
Expand Down Expand Up @@ -521,7 +526,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
#endif

// Load R Marking table (must be loaded before first bucket, on the same stream)
CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
Log::Line( "Marker Set to %d", 17)
CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) );

// Load initial bucket
Expand All @@ -536,7 +542,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
uint64 tablePrunedEntryCount = 0;
uint32 rTableOffset = 0;

CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 18)
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
Expand All @@ -559,7 +566,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;

#if _DEBUG
CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
Log::Line( "Marker Set to %d", 19)
CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
#endif

CudaConvertInlinedXsToLinePoints<<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>(
Expand Down
27 changes: 18 additions & 9 deletions cuda/CudaPlotPhase3Step2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryC
const uint64 divisor = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT;

// #TODO: Use upload stream?
CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 20)
CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
}

CudaConvertRMapToLinePoints<false><<<blocks, threads, 0, stream>>>( Rmap2LPParams, 0 );
Expand Down Expand Up @@ -223,7 +224,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
if( isCompressed )
{
// Copy from upload buffer to working buffer
CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 21)
CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
}
else
{
Expand Down Expand Up @@ -264,7 +266,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
LoadRBucket( cx, 0 );

// Clear pruned entry count
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
Log::Line( "Marker Set to %d", 22)
CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );

// Unpack the first map beforehand
UnpackLBucket( cx, 0 );
Expand All @@ -275,7 +278,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
///
uint32 rTableOffset = 0; // Track the global origin index of R entry/line point

CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
Log::Line( "Marker Set to %d", 23)
CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );

for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
Expand All @@ -298,7 +302,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount );

uint32* nextLTable = s2.devLTable[nextBucket & 1];
CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 24)
CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
}

if( nextBucketL < BBCU_BUCKET_COUNT )
Expand Down Expand Up @@ -351,12 +356,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
// Copy slice counts & bucket count
cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream();

CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
Log::Line( "Marker Set to %d", 25)
CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
cudaMemcpyDeviceToHost, downloadStream ) );

memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );

CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
Log::Line( "Marker Set to %d", 26)
CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
{
Expand Down Expand Up @@ -481,9 +488,11 @@ void WritePark7( CudaK32PlotContext& cx )
const size_t copySize = sizeof( uint32 ) * retainedEntryCount;

if( !isLastBucket )
CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
Log::Line( "Marker Set to %d", 27)
CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
else
CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
Log::Line( "Marker Set to %d", 28)
CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
}

// Download parks & write to plot
Expand Down
Loading