From c7cb73cdc27e4d4205bc1ce68c6c2fdadf79ceb9 Mon Sep 17 00:00:00 2001
From: Moz <contact@ether-source.fr>
Date: Thu, 2 Nov 2023 22:28:52 +0100
Subject: [PATCH] add marker

---
 cuda/CudaMatch.cu           |  6 ++--
 cuda/CudaParkSerializer.cu  |  6 ++--
 cuda/CudaPlotContext.h      |  6 ++--
 cuda/CudaPlotPhase2.cu      | 15 ++++++----
 cuda/CudaPlotPhase3.cu      | 24 ++++++++++------
 cuda/CudaPlotPhase3Step2.cu | 27 ++++++++++++------
 cuda/CudaPlotPhase3Step3.cu | 51 ++++++++++++++++++++++-----------
 cuda/CudaPlotUtil.cu        |  6 ++--
 cuda/CudaPlotter.cu         | 24 ++++++++++------
 cuda/GpuDownloadStream.cu   | 18 ++++++++----
 cuda/GpuQueue.cu            | 44 ++++++++++++++++++----------
 cuda/GpuStreams.cu          | 57 ++++++++++++++++++++++++-------------
 cuda/chacha8.cu             | 15 ++++++----
 13 files changed, 199 insertions(+), 100 deletions(-)
diff --git a/cuda/CudaMatch.cu b/cuda/CudaMatch.cu
index 359df68d..c1fdbff7 100644
--- a/cuda/CudaMatch.cu
+++ b/cuda/CudaMatch.cu
@@ -592,8 +592,10 @@ void CudaMatchBucketizedK32(
         CudaSetFirstAndLastGroup<<<1,2,0,stream>>>( tmpGroupCounts, entryCount );
     }
 
-    CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
-    CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
+    Log::Line( "Marker Set to %d", 1)
+CudaErrCheck( cudaMemsetAsync( cx.devGroupCount, 0, sizeof( uint32 ), stream ) );
+    Log::Line( "Marker Set to %d", 2)
+CudaErrCheck( cudaMemsetAsync( cx.devMatchCount, 0, sizeof( uint32 ), stream ) );
     ScanGroupsCudaK32Bucket<<<kscanblocks, BBCU_SCAN_GROUP_THREADS, 0, stream>>>( devY, tmpGroupCounts+2, cx.devGroupCount, entryCount, bucketMask );
 
     byte*  sortTmpAlloc = (byte*)( tmpGroupCounts + BBCU_MAX_GROUP_COUNT );
diff --git a/cuda/CudaParkSerializer.cu b/cuda/CudaParkSerializer.cu
index f3e8b8d4..91516a7a 100644
--- a/cuda/CudaParkSerializer.cu
+++ b/cuda/CudaParkSerializer.cu
@@ -20,8 +20,10 @@ void InitFSEBitMask( CudaK32PlotContext& cx )
         0x3FFFFFFF, 0x7FFFFFFF
     };
 
-    CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
-    CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 3)
+CudaErrCheck( cudaMemcpyToSymbolAsync( CUDA_FSE_BIT_mask, bitmask, sizeof( bitmask ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 4)
+CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
 }
 
 
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index fc5884b3..c83b3a32 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -576,8 +576,10 @@ inline void DbgPrintDeviceHash( const char* msg, const void* ptr, const size_t s
     byte hash[32];
 
     void* hostBuffer = bbvirtallocboundednuma<byte>( size );
-    CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
-    CudaErrCheck( cudaStreamSynchronize( stream ) );
+    Log::Line( "Marker Set to %d", 5)
+CudaErrCheck( cudaMemcpyAsync( hostBuffer, ptr, size, cudaMemcpyDeviceToHost, stream ) );
+    Log::Line( "Marker Set to %d", 6)
+CudaErrCheck( cudaStreamSynchronize( stream ) );
 
     blake3_hasher hasher;
     blake3_hasher_init( &hasher );
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
index 87c1b16c..d66ab8b6 100644
--- a/cuda/CudaPlotPhase2.cu
+++ b/cuda/CudaPlotPhase2.cu
@@ -119,7 +119,8 @@ static void BytefieldToBitfield( CudaK32PlotContext& cx, const byte* bytefield,
 
     #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
         #define G_PRUNED_COUNTS ,cx.phase2->devPrunedCount
-        CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
+        Log::Line( "Marker Set to %d", 7)
+CudaErrCheck( cudaMemsetAsync( cx.phase2->devPrunedCount, 0, sizeof( uint32 ), stream ) );
     #else
         #define G_PRUNED_COUNTS 
     #endif
@@ -176,7 +177,8 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
     }
 
     // Zero-out marks
-    CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 8)
+CudaErrCheck( cudaMemsetAsync( devLMarks, 0, GetMarkingTableByteSize(), cx.computeStream ) );
 
     // Load first bucket's worth of pairs
     LoadPairs( cx, p2, rTable, 0 );
@@ -230,9 +232,12 @@ void MarkTable( CudaK32PlotContext& cx, CudaK32Phase2& p2 )
 #if DBG_BBCU_P2_COUNT_PRUNED_ENTRIES
     {
         uint32 prunedEntryCount = 0;
-        CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
-        CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
-        CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+        Log::Line( "Marker Set to %d", 9)
+CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+        Log::Line( "Marker Set to %d", 10)
+CudaErrCheck( cudaMemcpyAsync( &prunedEntryCount, p2.devPrunedCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, cx.computeStream ) );
+        Log::Line( "Marker Set to %d", 11)
+CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
 
         const uint64 lEntryCount = cx.tableEntryCounts[(int)lTable];
         Log::Line( "Table %u now has %u / %llu  ( %.2lf%% ) entries.", (uint)lTable+1, 
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index 8fcdfe2a..176f344c 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -374,12 +374,14 @@ void Step1( CudaK32PlotContext& cx )
     const TableId rTable = cx.table;
 
     // Clear pruned table count
-    CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 12)
+CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
 
     // Load marking table (must be loaded before first bucket, on the same stream)
     if( cx.table < TableId::Table7 )
     {
-        CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
+        Log::Line( "Marker Set to %d", 13)
+CudaErrCheck( cudaMemcpyAsync( s1.rTableMarks, cx.hostMarkingTables[(int)rTable],
                         GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, s1.pairsLIn.GetQueue()->GetStream() ) );
     }
 
@@ -416,7 +418,8 @@ void Step1( CudaK32PlotContext& cx )
         // Generate map
         #define KERN_RMAP_ARGS entryCount, rTableOffset, devSliceCounts, p3.devPrunedEntryCount, devRMap, devLPairs, devRPairs, s1.rTableMarks
 
-        CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
+        Log::Line( "Marker Set to %d", 14)
+CudaErrCheck( cudaMemsetAsync( devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT, cx.computeStream ) );
 
         if( cx.table < TableId::Table7 )
             PruneAndWriteRMap<true><<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>( KERN_RMAP_ARGS );
@@ -436,7 +439,8 @@ void Step1( CudaK32PlotContext& cx )
     // Download slice counts
     cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
 
-    CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, 
+    Log::Line( "Marker Set to %d", 15)
+CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
                     cudaMemcpyDeviceToHost, downloadStream ) );
 
     // Wait for completion
@@ -446,7 +450,8 @@ void Step1( CudaK32PlotContext& cx )
     s1.pairsLIn.Reset();
     s1.pairsRIn.Reset();
 
-    CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+    Log::Line( "Marker Set to %d", 16)
+CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
 
     // Add-up pruned bucket counts and tables counts
     memcpy( &s1.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
@@ -521,7 +526,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
     #endif
 
     // Load R Marking table (must be loaded before first bucket, on the same stream)
-    CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
+    Log::Line( "Marker Set to %d", 17)
+CudaErrCheck( cudaMemcpyAsync( (void*)tx.devRMarks, cx.hostMarkingTables[(int)rTable],
                 GetMarkingTableBitFieldSize(), cudaMemcpyHostToDevice, p3.xTable.xIn.GetQueue()->GetStream() ) );
 
     // Load initial bucket
@@ -536,7 +542,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
     uint64 tablePrunedEntryCount = 0;
     uint32 rTableOffset          = 0;
 
-    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 18)
+CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
 
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
     {
@@ -559,7 +566,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
         uint32* devSliceCounts = cx.devSliceCounts + bucket * BBCU_BUCKET_COUNT;
 
         #if _DEBUG
-            CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
+            Log::Line( "Marker Set to %d", 19)
+CudaErrCheck( cudaMemsetAsync( outLps, 0, sizeof( uint64 ) * P3_PRUNED_BUCKET_MAX, cx.computeStream ) );
         #endif
 
         CudaConvertInlinedXsToLinePoints<<<blocksPerGrid, threadPerBlock, 0, cx.computeStream>>>(
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index 3a7a6449..eaa03f15 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -148,7 +148,8 @@ static void ConvertRMapToLinePoints( CudaK32PlotContext& cx, const uint32 entryC
             const uint64 divisor          = P3_CalculateMaxLPValue( prunedEntryCount ) / BBCU_BUCKET_COUNT;
 
             // #TODO: Use upload stream?
-            CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
+            Log::Line( "Marker Set to %d", 20)
+CudaErrCheck( cudaMemcpyToSymbolAsync( BucketDivisor, &divisor, sizeof( divisor ), 0, cudaMemcpyHostToDevice, cx.computeStream ) );
         }
 
         CudaConvertRMapToLinePoints<false><<<blocks, threads, 0, stream>>>( Rmap2LPParams, 0 );
@@ -223,7 +224,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
         if( isCompressed )
         {
             // Copy from upload buffer to working buffer
-            CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+            Log::Line( "Marker Set to %d", 21)
+CudaErrCheck( cudaMemcpyAsync( lTable, lMap, BBCU_BUCKET_ENTRY_COUNT * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
         }
         else
         {
@@ -264,7 +266,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
     LoadRBucket( cx, 0 );
 
     // Clear pruned entry count
-    CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 22)
+CudaErrCheck( cudaMemsetAsync( p3.devPrunedEntryCount, 0, sizeof( uint32 ), cx.computeStream ) );
 
     // Unpack the first map beforehand
     UnpackLBucket( cx, 0 );
@@ -275,7 +278,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
     /// 
     uint32 rTableOffset = 0; // Track the global origin index of R entry/line point 
 
-    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 23)
+CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
 
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
     {
@@ -298,7 +302,8 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
             static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT - BBCU_BUCKET_ENTRY_COUNT == copyCount );
 
             uint32* nextLTable = s2.devLTable[nextBucket & 1];
-            CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
+            Log::Line( "Marker Set to %d", 24)
+CudaErrCheck( cudaMemcpyAsync( (uint32*)devLTable + BBCU_BUCKET_ENTRY_COUNT, nextLTable, copyCount * sizeof( uint32 ), cudaMemcpyDeviceToDevice, cx.computeStream ) );
         }
 
         if( nextBucketL < BBCU_BUCKET_COUNT )
@@ -351,12 +356,14 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
     // Copy slice counts & bucket count
     cudaStream_t downloadStream = s2.lpOut.GetQueue()->GetStream();
 
-    CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+    Log::Line( "Marker Set to %d", 25)
+CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
                     cudaMemcpyDeviceToHost, downloadStream ) );
 
     memset( p3.prunedBucketCounts[(int)rTable], 0, BBCU_BUCKET_COUNT * sizeof( uint32 ) );
 
-    CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+    Log::Line( "Marker Set to %d", 26)
+CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
     bbmemcpy_t( &s2.prunedBucketSlices[0][0], cx.hostBucketSlices, BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
     {
@@ -481,9 +488,11 @@ void WritePark7( CudaK32PlotContext& cx )
             const size_t  copySize             = sizeof( uint32 ) * retainedEntryCount;
 
             if( !isLastBucket )
-                CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
+                Log::Line( "Marker Set to %d", 27)
+CudaErrCheck( cudaMemcpyAsync( devIndexBuffer - retainedEntryCount, copySource, copySize, cudaMemcpyDeviceToDevice, cx.computeStream ) );
             else
-                CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
+                Log::Line( "Marker Set to %d", 28)
+CudaErrCheck( cudaMemcpyAsync( hostLastParkEntries, copySource, copySize, cudaMemcpyDeviceToHost, cx.computeStream ) );
         }
 
         // Download parks & write to plot
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index c8f9337b..93b7e681 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -61,7 +61,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     const FSE_CTable* hostCTable = !isCompressed ? CTables[(int)lTable] : cx.gCfg->ctable;
 
     // (upload must be loaded before first bucket, on the same stream)
-    CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice, 
+    Log::Line( "Marker Set to %d", 29)
+CudaErrCheck( cudaMemcpyAsync( s3.devCTable, hostCTable, cTableSize, cudaMemcpyHostToDevice,
                     s3.lpIn.GetQueue()->GetStream() ) );
 
     // Load initial bucket
@@ -116,11 +117,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     cudaStream_t lpStream         = cx.computeStream;//B;
     cudaStream_t downloadStream   = cx.gpuDownloadStream[0]->GetStream();
 
-    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
-    CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );
+    Log::Line( "Marker Set to %d", 30)
+CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, sortAndMapStream ) );
+    Log::Line( "Marker Set to %d", 31)
+CudaErrCheck( cudaMemsetAsync( s3.devParkOverrunCount, 0, sizeof( uint32 ), sortAndMapStream ) );
 
     // Set initial event LP stream event as set.
-    CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
+    Log::Line( "Marker Set to %d", 32)
+CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );
 
     cx.parkFence->Reset( 0 );
     s3.parkBucket = 0;
@@ -145,23 +149,27 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
         #if _DEBUG
         {
             size_t sortRequiredSize = 0;
-            CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
+            Log::Line( "Marker Set to %d", 33)
+CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>( nullptr, sortRequiredSize, nullptr, nullptr, nullptr, nullptr, bucketEntryCount, 0, 64 ) );
             ASSERT( s3.sizeTmpSort >= sortRequiredSize );
         }
         #endif
 
         // Wait for the previous bucket's LP work to finish, so we can re-use the device buffer
-        CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );
+        Log::Line( "Marker Set to %d", 34)
+CudaErrCheck( cudaStreamWaitEvent( sortAndMapStream, cx.computeEventA ) );
 
         // #TODO: We can use 63-7 (log2(128 buckets)), which might be faster
         // #NOTE: I did change it and the sort failed. Investigate.
-        CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>(
+        Log::Line( "Marker Set to %d", 35)
+CudaErrCheck( cub::DeviceRadixSort::SortPairs<uint64, uint32>(
             s3.devSortTmpData,  s3.sizeTmpSort,
             unsortedLinePoints, sortedLinePoints,
             unsortedIndices,    sortedIndices,
             bucketEntryCount, 0, 64, sortAndMapStream ) );
 
-        CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );
+        Log::Line( "Marker Set to %d", 36)
+CudaErrCheck( cudaEventRecord( cx.computeEventB, sortAndMapStream ) );
 
         s3.lpIn   .ReleaseDeviceBuffer( sortAndMapStream ); unsortedLinePoints = nullptr;
         s3.indexIn.ReleaseDeviceBuffer( sortAndMapStream ); unsortedIndices    = nullptr;
@@ -190,12 +198,14 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
         ASSERT( parkCount <= P3_PRUNED_MAX_PARKS_PER_BUCKET );
 
         // Wait for sort to finish
-        CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );
+        Log::Line( "Marker Set to %d", 37)
+CudaErrCheck( cudaStreamWaitEvent( lpStream, cx.computeEventB ) );
 
         // Deltafy line points
         DeltafyLinePoints( cx, entryCount, parkLinePoints, s3.devDeltaLinePoints, lpStream );
 
-        CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) );  // Signal download stream can download remaining line points for last park
+        Log::Line( "Marker Set to %d", 38)
+CudaErrCheck( cudaEventRecord( cx.computeEventC, lpStream ) );  // Signal download stream can download remaining line points for last park
 
         // Compress line point parks
         byte* devParks = (byte*)s3.parksOut.LockDeviceBuffer( lpStream );
@@ -214,20 +224,24 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
             if( !isLastBucket )
             {
                 // Not the last bucket, so retain entries for the next GPU compression bucket
-                CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
+                Log::Line( "Marker Set to %d", 39)
+CudaErrCheck( cudaMemcpyAsync( sortedLinePoints - retainedLPCount, copySource, copySize, cudaMemcpyDeviceToDevice, lpStream ) );
             }
             else
             {       
                 // No more buckets so we have to compress this last park on the CPU
-                CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
+                Log::Line( "Marker Set to %d", 40)
+CudaErrCheck( cudaStreamWaitEvent( downloadStream, cx.computeEventC ) );
 
                 hostRetainedEntries = cx.useParkContext ? cx.parkContext->hostRetainedLinePoints :
                                                        (uint64*)( hostParksWriter + hostParkSize * parkCount );
-                CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
+                Log::Line( "Marker Set to %d", 41)
+CudaErrCheck( cudaMemcpyAsync( hostRetainedEntries, copySource, copySize, cudaMemcpyDeviceToHost, downloadStream ) );
             }
         }
 
-        CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );  // Signal sortedLinePoints buffer ready for use again
+        Log::Line( "Marker Set to %d", 42)
+CudaErrCheck( cudaEventRecord( cx.computeEventA, lpStream ) );  // Signal sortedLinePoints buffer ready for use again
 
 
         // Download parks
@@ -270,17 +284,20 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     }
 
     // Copy park overrun count
-    CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );
+    Log::Line( "Marker Set to %d", 43)
+CudaErrCheck( cudaMemcpyAsync( s3.hostParkOverrunCount, s3.devParkOverrunCount, sizeof( uint32 ), cudaMemcpyDeviceToHost, downloadStream ) );
 
     // Wait for parks to complete downloading
     s3.parksOut.WaitForCompletion();
     s3.parksOut.Reset();
 
     // Copy map slice counts (for the next step 2)
-    CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
+    Log::Line( "Marker Set to %d", 44)
+CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
                     cudaMemcpyDeviceToHost, downloadStream ) );
 
-    CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
+    Log::Line( "Marker Set to %d", 45)
+CudaErrCheck( cudaStreamSynchronize( downloadStream ) );
     memcpy( &s3.prunedBucketSlices[0][0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );
 
     FatalIf( *s3.hostParkOverrunCount > 0, "Park buffer overrun." );
diff --git a/cuda/CudaPlotUtil.cu b/cuda/CudaPlotUtil.cu
index 4f7f18b3..228be9f6 100644
--- a/cuda/CudaPlotUtil.cu
+++ b/cuda/CudaPlotUtil.cu
@@ -21,7 +21,8 @@ void CudaK32PlotGenSortKey( const uint32 entryCount, uint32* devKey, cudaStream_
 
     GenSortKey<<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey );
     if( synchronize )
-        CudaErrCheck( cudaStreamSynchronize( stream ) );
+        Log::Line( "Marker Set to %d", 46)
+CudaErrCheck( cudaStreamSynchronize( stream ) );
     
 }
 
@@ -48,7 +49,8 @@ void CudaK32PlotSortByKey( const uint32 entryCount, const uint32* devKey, const
 
     SortByKey<T><<<blockCount, threadsPerBlock, 0, stream>>>( entryCount, devKey, devInput, devOutput );
     if( synchronize )
-        CudaErrCheck( cudaStreamSynchronize( stream ) );
+        Log::Line( "Marker Set to %d", 47)
+CudaErrCheck( cudaStreamSynchronize( stream ) );
 }
 
 //-----------------------------------------------------------
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 80ba8b0e..64e9e94d 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -201,10 +201,14 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
 
     CudaInit( cx );
 
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 48)
+CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 49)
+CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamB, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 50)
+CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamC, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 51)
+CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStreamD, cudaStreamNonBlocking ) );
 
     cudaEventCreateWithFlags( &cx.computeEventA, cudaEventDisableTiming );
     cudaEventCreateWithFlags( &cx.computeEventB, cudaEventDisableTiming );
@@ -318,7 +322,8 @@ void CudaInit( CudaK32PlotContext& cx )
     cx.cudaDevice = (int32)cx.cfg.deviceIndex;
 
     cudaDeviceProp* cudaDevProps = new cudaDeviceProp{};
-    CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
+    Log::Line( "Marker Set to %d", 52)
+CudaErrCheck( cudaGetDeviceProperties( cudaDevProps, cx.cudaDevice ) );
     cx.cudaDevProps = cudaDevProps;
 
     Log::Line( "Selected cuda device %u : %s", cx.cudaDevice, cudaDevProps->name );
@@ -364,7 +369,8 @@ void CudaK32Plotter::Run( const PlotRequest& req )
     const auto& cfg = _cfg;
 
     // Only start profiling from here (don't profile allocations)
-    CudaErrCheck( cudaProfilerStart() );
+    Log::Line( "Marker Set to %d", 53)
+CudaErrCheck( cudaProfilerStart() );
 
     ASSERT( cx.plotWriter == nullptr );
     cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
@@ -522,7 +528,8 @@ void FpTable( CudaK32PlotContext& cx )
     }
 
     // Clear slice counts
-    CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 54)
+CudaErrCheck( cudaMemsetAsync( cx.devSliceCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
 
     // Load initial buckets
     UploadBucketForTable( cx, 0 );
@@ -533,7 +540,8 @@ void FpTable( CudaK32PlotContext& cx )
         FpTableBucket( cx, bucket );
     }
 
-    CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 55)
+CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
 
     // Copy bucket slices to host
     cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, 
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index 4d35a06b..d903adc2 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -11,7 +11,8 @@ void* GpuDownloadBuffer::GetDeviceBuffer()
 {
     const uint32 index = self->outgoingSequence % self->bufferCount;
 
-    CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
+    Log::Line( "Marker Set to %d", 56)
+CudaErrCheck( cudaEventSynchronize( self->events[index] ) );
 
     return self->deviceBuffer[index];
 }
@@ -25,7 +26,8 @@ void* GpuDownloadBuffer::LockDeviceBuffer( cudaStream_t stream )
     self->lockSequence++;
 
     // Wait for the device buffer to be free to be used by kernels
-    CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
+    Log::Line( "Marker Set to %d", 57)
+CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
     return self->deviceBuffer[index];
 }
 
@@ -73,10 +75,12 @@ void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, co
     // // Copy
 
     // // Signal that the device buffer is free to be re-used
-    // CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
+    // Log::Line( "Marker Set to %d", 58)
+CudaErrCheck( cudaEventRecord( self->events[index], stream ) );
 
     // // Launch copy command
-    // CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
+    // Log::Line( "Marker Set to %d", 59)
+CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ){
 
     //     const CopyInfo& c = *reinterpret_cast<CopyInfo*>( userData );
     //     IGpuBuffer* self = c.self;
@@ -131,11 +135,13 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_
 
 
     // Signal from the work stream when it has finished doing kernel work with the device buffer
-    CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) );
+    Log::Line( "Marker Set to %d", 60)
+CudaErrCheck( cudaEventRecord( self->workEvent[index], workStream ) );
 
     // From the download stream, wait for the work stream to finish
     cudaStream_t downloadStream = self->queue->_stream;
-    CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) );
+    Log::Line( "Marker Set to %d", 61)
+CudaErrCheck( cudaStreamWaitEvent( downloadStream, self->workEvent[index] ) );
 
 
     if( self->diskBuffer )
diff --git a/cuda/GpuQueue.cu b/cuda/GpuQueue.cu
index f673f640..110929a0 100644
--- a/cuda/GpuQueue.cu
+++ b/cuda/GpuQueue.cu
@@ -9,9 +9,12 @@
 GpuQueue::GpuQueue( Kind kind ) : _kind( kind )
     , _bufferReadySignal( BBCU_BUCKET_COUNT )
 {
-    CudaErrCheck( cudaStreamCreateWithFlags( &_stream        , cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) );
-    CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 62)
+CudaErrCheck( cudaStreamCreateWithFlags( &_stream        , cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 63)
+CudaErrCheck( cudaStreamCreateWithFlags( &_preloadStream , cudaStreamNonBlocking ) );
+    Log::Line( "Marker Set to %d", 64)
+CudaErrCheck( cudaStreamCreateWithFlags( &_callbackStream, cudaStreamNonBlocking ) );
 
     _queueThread.Run( QueueThreadEntryPoint, this );
 }
@@ -168,13 +171,19 @@ struct IGpuBuffer* GpuQueue::CreateGpuBuffer( const GpuStreamDescriptor& desc, b
 
         for( int32 i = 0; i < desc.bufferCount; i++ )
         {
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i]         , cudaEventDisableTiming ) );
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i]    , cudaEventDisableTiming ) );
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i]  , cudaEventDisableTiming ) );
-
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent     , cudaEventDisableTiming ) );
-            CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) );
+            Log::Line( "Marker Set to %d", 65)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->events[i]         , cudaEventDisableTiming ) );
+            Log::Line( "Marker Set to %d", 66)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->completedEvents[i], cudaEventDisableTiming ) );
+            Log::Line( "Marker Set to %d", 67)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->readyEvents[i]    , cudaEventDisableTiming ) );
+            Log::Line( "Marker Set to %d", 68)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->pinnedEvent[i]  , cudaEventDisableTiming ) );
+
+            Log::Line( "Marker Set to %d", 69)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackLockEvent     , cudaEventDisableTiming ) );
+            Log::Line( "Marker Set to %d", 70)
+CudaErrCheck( cudaEventCreateWithFlags( &buf->callbackCompletedEvent, cudaEventDisableTiming ) );
             
             buf->deviceBuffer[i] = devBuffers[i];
             buf->pinnedBuffer[i] = pinnedBuffers[i];
@@ -240,14 +249,17 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c
     // #MAYBE: Perhaps support having multiple callback streams, and multiple copy streams.
 
     // Signal from the work stream into the callback stream that we are ready for callback
-    CudaErrCheck( cudaEventRecord( lockEvent, stream ) );
+    Log::Line( "Marker Set to %d", 71)
+CudaErrCheck( cudaEventRecord( lockEvent, stream ) );
 
     // Wait on the callback stream until it's ready to dsitpatch
-    CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) );
+    Log::Line( "Marker Set to %d", 72)
+CudaErrCheck( cudaStreamWaitEvent( _callbackStream, lockEvent ) );
 
     // #MAYBE: Use a bump allocator perhaps later to avoid locking here by new/delete if needed for performance.
     auto* fnCpy = new std::function<void()>( std::move( func ) );
-    CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){
+    Log::Line( "Marker Set to %d", 73)
+CudaErrCheck( cudaLaunchHostFunc( _callbackStream, []( void* userData ){
 
         auto& fn = *reinterpret_cast<std::function<void()>*>( userData );
         fn();
@@ -256,10 +268,12 @@ void GpuQueue::DispatchHostFunc( GpuCallbackDispath func, cudaStream_t stream, c
     }, fnCpy ) );
 
     // Signal from the callback stream that the callback finished
-    CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) );
+    Log::Line( "Marker Set to %d", 74)
+CudaErrCheck( cudaEventRecord( completedEvent, _callbackStream ) );
 
     // Wait on work stream for the callback to complete
-    CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) );
+    Log::Line( "Marker Set to %d", 75)
+CudaErrCheck( cudaStreamWaitEvent( stream, completedEvent ) );
 }
 
 size_t GpuQueue::CalculateSliceSizeFromDescriptor( const GpuStreamDescriptor& desc )
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 2c129408..e075e306 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -60,26 +60,32 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
     {
         // Copy from unpinned to pinned first
         // #TODO: This should be done in a different backgrund host-to-host copy stream
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
-        CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+        Log::Line( "Marker Set to %d", 76)
+CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+        Log::Line( "Marker Set to %d", 77)
+CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
 
         hostBuffer = self->pinnedBuffer[index];
     }
 
     // Ensure the device buffer is ready for use
-    CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+    Log::Line( "Marker Set to %d", 78)
+CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
 
     // Upload to the device buffer
-    CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );
+    Log::Line( "Marker Set to %d", 79)
+CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );
 
     if( !isDirect )
     {
         // Signal that the pinned buffer is ready for re-use
-        CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+        Log::Line( "Marker Set to %d", 80)
+CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
     }
 
     // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
+    Log::Line( "Marker Set to %d", 81)
+CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
 
 void GpuUploadBuffer::UploadAndPreLoad( void* hostBuffer, const size_t size, const void* copyBufferSrc, const size_t copySize )
@@ -164,7 +170,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
         const auto copyMode  = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
 
         // Wait on device or pinned buffer to be ready (depending if a direct copy or not)
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );
+        Log::Line( "Marker Set to %d", 82)
+CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );
 
         const byte*   src   = (byte*)hostBuffer;
               byte*   dst   = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
@@ -174,7 +181,8 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
         {
             const size_t size = *sizes * (size_t)elementSize;
 
-            CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );
+            Log::Line( "Marker Set to %d", 83)
+CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );
 
             dst    += size;
             src    += srcStride;
@@ -199,15 +207,19 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
         }
 
         // #TODO: This should be done in a copy stream to perform the copies in the background
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-        CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
+        Log::Line( "Marker Set to %d", 84)
+CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        Log::Line( "Marker Set to %d", 84)
+CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
 
         if( !self->diskBuffer )
-            CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+            Log::Line( "Marker Set to %d", 85)
+CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
     }
 
     // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
+    Log::Line( "Marker Set to %d", 86)
+CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
 
 void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, 
@@ -218,7 +230,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB
     auto stream = self->queue->GetStream();
 
     // Ensure the device buffer is ready for use
-    CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
+    Log::Line( "Marker Set to %d", 87)
+CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
 
     // Perform uploads
     //size_t deviceCopySize = 0;
@@ -229,7 +242,8 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB
     {
         const size_t size = *counts * (size_t)elementSize;
         //memcpy( dst, src, size );
-        CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) );
+        Log::Line( "Marker Set to %d", 88)
+CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) );
 
         //deviceCopySize += size;
 
@@ -239,10 +253,12 @@ void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostB
     }
 
     // Copy to device buffer
-    //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) );
+    //Log::Line( "Marker Set to %d", 89)
+CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) );
 
     // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+    Log::Line( "Marker Set to %d", 90)
+CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
 }
 
 void GpuUploadBuffer::Upload( const void* hostBuffer, const size_t size )
@@ -269,7 +285,8 @@ void* GpuUploadBuffer::GetUploadedDeviceBuffer( cudaStream_t workStream )
     const uint32 index = self->completedSequence % self->bufferCount;
     self->completedSequence++;
 
-    CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) );
+    Log::Line( "Marker Set to %d", 91)
+CudaErrCheck( cudaStreamWaitEvent( workStream, self->readyEvents[index] ) );
 
     return self->deviceBuffer[index];
 }
@@ -283,7 +300,8 @@ void GpuUploadBuffer::ReleaseDeviceBuffer( cudaStream_t workStream )
     const uint32 index = self->lockSequence % self->bufferCount;
     self->lockSequence++;
 
-    CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) );
+    Log::Line( "Marker Set to %d", 92)
+CudaErrCheck( cudaEventRecord( self->deviceEvents[index], workStream ) );
 }
 
 void GpuUploadBuffer::WaitForPreloadsToComplete()
@@ -341,7 +359,8 @@ DiskBufferBase* GpuUploadBuffer::GetDiskBuffer() const
 void GpuUploadBuffer::CallHostFunctionOnStream( cudaStream_t stream, std::function<void()> func )
 {
     auto* fnCpy = new std::function<void()>( std::move( func ) );
-    CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
+    Log::Line( "Marker Set to %d", 93)
+CudaErrCheck( cudaLaunchHostFunc( stream, []( void* userData ) {
 
         auto& fn = *reinterpret_cast<std::function<void()>*>( userData );
         fn();
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index 3e17c15a..32f665f2 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -201,8 +201,10 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     chacha8_ctx chacha;
     chacha8_keysetup( &chacha, key, 256, nullptr );
     
-    CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) );
-    CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 94)
+CudaErrCheck( cudaMemcpyAsync( devChaChaInput, chacha.input, 64, cudaMemcpyHostToDevice, cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 95)
+CudaErrCheck( cudaMemsetAsync( devBucketCounts, 0, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, cx.computeStream ) );
    
     const uint32 outIndex = CudaK32PlotGetOutputIndex( cx );
 
@@ -218,7 +220,8 @@ void GenF1Cuda( CudaK32PlotContext& cx )
         uint32* devMeta = (uint32*)cx.metaOut.LockDeviceBuffer( cx.computeStream );
 
         #if _DEBUG
-            CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) );
+            Log::Line( "Marker Set to %d", 96)
+CudaErrCheck( cudaMemsetAsync( devY, 0, sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, cx.computeStream ) );
         #endif
 
         // Gen chacha blocks
@@ -229,10 +232,12 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     }
 
     // Copy bucket slices to host
-    CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT, 
+    Log::Line( "Marker Set to %d", 97)
+CudaErrCheck( cudaMemcpyAsync( cx.hostBucketSlices, cx.devSliceCounts, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT,
                         cudaMemcpyDeviceToHost, cx.computeStream ) );
 
-    CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
+    Log::Line( "Marker Set to %d", 98)
+CudaErrCheck( cudaStreamSynchronize( cx.computeStream ) );
 
     memcpy( &cx.bucketSlices[0], cx.hostBucketSlices, sizeof( uint32 ) * BBCU_BUCKET_COUNT * BBCU_BUCKET_COUNT );