heavy: add error checks, fix strict aliasing and linux

The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80...
cbuchner1 · Nov 27, 2014 · 1b65cd0 · 1b65cd0
1 parent 1032f19
commit 1b65cd0
Show file tree

Hide file tree

Showing 20 changed files with 339 additions and 381 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
+heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
 keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
 

diff --git a/blake32.cu b/blake32.cu
@@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 		ending[3] = nonce; /* our tested value */
 
 		blake256_compress(h, ending, 640, rounds);
-
+#if 0
+		if (trace) {
+			printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
+		}
+#endif
 		//if (h[7] == 0 && high64 <= highTarget) {
 		if (h[7] == 0) {
 #if NBN == 2
@@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 #else
 			resNonce[0] = nonce;
 #endif
-			if (trace) {
 #ifdef _DEBUG
+			if (trace) {
 				uint64_t high64 = ((uint64_t*)h)[3];
 				printf("gpu:  %16llx\n", high64);
 				printf("gpu: %08x.%08x\n", h[7], h[6]);
 				printf("tgt:  %16llx\n", highTarget);
-#endif
 			}
+#endif
 		}
 	}
 }

diff --git a/ccminer.vcxproj b/ccminer.vcxproj
@@ -173,7 +173,7 @@
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>false</Keep>
-      <CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50</CodeGeneration>
       <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
       <Defines>
       </Defines>
@@ -306,12 +306,7 @@
     <ClInclude Include="cuda_groestlcoin.h" />
     <ClInclude Include="cuda_helper.h" />
     <ClInclude Include="elist.h" />
-    <ClInclude Include="heavy\cuda_blake512.h" />
-    <ClInclude Include="heavy\cuda_combine.h" />
-    <ClInclude Include="heavy\cuda_groestl512.h" />
-    <ClInclude Include="heavy\cuda_hefty1.h" />
-    <ClInclude Include="heavy\cuda_keccak512.h" />
-    <ClInclude Include="heavy\cuda_sha256.h" />
+    <ClInclude Include="heavy\heavy.h" />
     <ClInclude Include="hefty1.h" />
     <ClInclude Include="miner.h" />
     <ClInclude Include="nvml.h" />
@@ -358,6 +353,7 @@
     <CudaCompile Include="heavy\cuda_groestl512.cu">
     </CudaCompile>
     <CudaCompile Include="heavy\cuda_hefty1.cu">
+      <MaxRegCount>80</MaxRegCount>
     </CudaCompile>
     <CudaCompile Include="heavy\cuda_keccak512.cu">
     </CudaCompile>

diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
@@ -43,9 +43,6 @@
     <Filter Include="Source Files\CUDA\heavy">
       <UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
     </Filter>
-    <Filter Include="Header Files\CUDA\heavy">
-      <UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
-    </Filter>
     <Filter Include="Source Files\CUDA\qubit">
       <UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier>
     </Filter>
@@ -293,23 +290,8 @@
     <ClInclude Include="sph\sph_types.h">
       <Filter>Header Files\sph</Filter>
     </ClInclude>
-    <ClInclude Include="heavy\cuda_blake512.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
-    </ClInclude>
-    <ClInclude Include="heavy\cuda_combine.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
-    </ClInclude>
-    <ClInclude Include="heavy\cuda_groestl512.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
-    </ClInclude>
-    <ClInclude Include="heavy\cuda_hefty1.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
-    </ClInclude>
-    <ClInclude Include="heavy\cuda_keccak512.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
-    </ClInclude>
-    <ClInclude Include="heavy\cuda_sha256.h">
-      <Filter>Header Files\CUDA\heavy</Filter>
+    <ClInclude Include="heavy\heavy.h">
+      <Filter>Header Files\CUDA</Filter>
     </ClInclude>
     <ClInclude Include="cuda_helper.h">
       <Filter>Header Files\CUDA</Filter>
@@ -539,4 +521,4 @@
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>
   </ItemGroup>
-</Project>
+</Project>
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash5output[8];
 
 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
@@ -53,13 +53,13 @@ __constant__ uint64_t c_u512[16];
 
 const uint64_t host_u512[16] =
 {
-  0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 
+  0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
   0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
-  0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 
+  0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
   0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
-  0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 
+  0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
   0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
-  0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 
+  0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
   0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };
 
@@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		//uint32_t nounce = startNounce + thread;
 		uint32_t nounce = nonceVector[thread];
 
@@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		h[6] = 0x1f83d9abfb41bd6bULL;
 		h[7] = 0x5be0cd19137e2179ULL;
 
-		// 128 Byte für die Message
+		// 128 Byte für die Message
 		uint64_t buf[16];
 
-		// Message für die erste Runde in Register holen
+		// Message für die erste Runde in Register holen
 #pragma unroll 16
 		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
 
@@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		uint32_t *hefty = heftyHashes + 8 * hashPosition;
 		if (BLOCKSIZE == 84) {
 			// den thread-spezifischen Hefty1 hash einsetzen
-			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
+			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
 			buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
 			buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
 			buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 
 		// erste Runde
 		blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );
-		
-		
+
+
 		// zweite Runde
 #pragma unroll 15
 		for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
 		buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
 		blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );
-		
+
 		// Hash rauslassen
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
 #pragma unroll 8
@@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
 						sizeof(host_SecondRound),
 						0, cudaMemcpyHostToDevice);
 
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
 }
 
 static int BLOCKSIZE = 84;
@@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
 {
 	unsigned char PaddedMessage[128];
 	if (len == 84) {
-		// Message mit Padding für erste Runde bereitstellen
+		// Message mit Padding für erste Runde bereitstellen
 		memcpy(PaddedMessage, pdata, 84);
-		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+116, 0, 12);
 		PaddedMessage[116] = 0x80;
 	} else if (len == 80) {
 		memcpy(PaddedMessage, pdata, 80);
-		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+112, 0, 16);
 		PaddedMessage[112] = 0x80;
 	}
@@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 80)
-		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
-		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
@@ -1,16 +1,19 @@
-#include "cuda_helper.h"
+#include <stdio.h>
 
-// globaler Speicher für unsere Ergebnisse
-uint32_t *d_hashoutput[8];
+#include "cuda_helper.h"
 
+// globaler Speicher für unsere Ergebnisse
+static uint32_t *d_hashoutput[8];
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
-extern uint32_t *d_nonceVector[8];
+
+extern uint32_t *heavy_nonceVector[8];
 
 /* Combines top 64-bits from each hash into a single hash */
-static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
+__device__
+static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
 {
 	uint32_t lout[8]; // Combining in Registern machen
 
@@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
 		out[i] = lout[i];
 }
 
-__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
+__global__
+void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
 	}
 }
 
-// Setup-Funktionen
-__host__ void combine_cpu_init(int thr_id, int threads)
+__host__
+void combine_cpu_init(int thr_id, int threads)
 {
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
 }
 
+__host__
 void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
 {
 	// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
@@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	size_t shared_size = 0;
-
-	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
+	combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
 
-	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
-	cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
+	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
+	CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
 }
diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h