Skip to content

Commit

Permalink
heavy: add error checks, fix strict aliasing and linux
Browse files Browse the repository at this point in the history
The core problem was the cuda hefty Thread per block set to high
but took me several hours to find that...

btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512
if max reg count is set to 80...
  • Loading branch information
tpruvot committed Nov 27, 2014
1 parent 1032f19 commit 1b65cd0
Show file tree
Hide file tree
Showing 20 changed files with 339 additions and 381 deletions.
3 changes: 3 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
blake32.o: blake32.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<

Expand Down
10 changes: 7 additions & 3 deletions blake32.cu
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
ending[3] = nonce; /* our tested value */

blake256_compress(h, ending, 640, rounds);

#if 0
if (trace) {
printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
}
#endif
//if (h[7] == 0 && high64 <= highTarget) {
if (h[7] == 0) {
#if NBN == 2
Expand All @@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
#else
resNonce[0] = nonce;
#endif
if (trace) {
#ifdef _DEBUG
if (trace) {
uint64_t high64 = ((uint64_t*)h)[3];
printf("gpu: %16llx\n", high64);
printf("gpu: %08x.%08x\n", h[7], h[6]);
printf("tgt: %16llx\n", highTarget);
#endif
}
#endif
}
}
}
Expand Down
10 changes: 3 additions & 7 deletions ccminer.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines>
</Defines>
Expand Down Expand Up @@ -306,12 +306,7 @@
<ClInclude Include="cuda_groestlcoin.h" />
<ClInclude Include="cuda_helper.h" />
<ClInclude Include="elist.h" />
<ClInclude Include="heavy\cuda_blake512.h" />
<ClInclude Include="heavy\cuda_combine.h" />
<ClInclude Include="heavy\cuda_groestl512.h" />
<ClInclude Include="heavy\cuda_hefty1.h" />
<ClInclude Include="heavy\cuda_keccak512.h" />
<ClInclude Include="heavy\cuda_sha256.h" />
<ClInclude Include="heavy\heavy.h" />
<ClInclude Include="hefty1.h" />
<ClInclude Include="miner.h" />
<ClInclude Include="nvml.h" />
Expand Down Expand Up @@ -358,6 +353,7 @@
<CudaCompile Include="heavy\cuda_groestl512.cu">
</CudaCompile>
<CudaCompile Include="heavy\cuda_hefty1.cu">
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile Include="heavy\cuda_keccak512.cu">
</CudaCompile>
Expand Down
24 changes: 3 additions & 21 deletions ccminer.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@
<Filter Include="Source Files\CUDA\heavy">
<UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\CUDA\heavy">
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\qubit">
<UniqueIdentifier>{f3ed23a2-8ce7-41a5-b051-6da56047dc35}</UniqueIdentifier>
</Filter>
Expand Down Expand Up @@ -293,23 +290,8 @@
<ClInclude Include="sph\sph_types.h">
<Filter>Header Files\sph</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_blake512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_combine.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_groestl512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_hefty1.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_keccak512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_sha256.h">
<Filter>Header Files\CUDA\heavy</Filter>
<ClInclude Include="heavy\heavy.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_helper.h">
<Filter>Header Files\CUDA</Filter>
Expand Down Expand Up @@ -539,4 +521,4 @@
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
</ItemGroup>
</Project>
</Project>
46 changes: 23 additions & 23 deletions heavy/cuda_blake512.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

#include "cuda_helper.h"

// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *d_heftyHashes[8];
extern uint32_t *d_nonceVector[8];
// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *heavy_nonceVector[8];

// globaler Speicher für unsere Ergebnisse
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash5output[8];

// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
Expand Down Expand Up @@ -53,13 +53,13 @@ __constant__ uint64_t c_u512[16];

const uint64_t host_u512[16] =
{
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};

Expand Down Expand Up @@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
// bestimme den aktuellen Zähler
//uint32_t nounce = startNounce + thread;
uint32_t nounce = nonceVector[thread];

Expand All @@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL;

// 128 Byte für die Message
// 128 Byte für die Message
uint64_t buf[16];

// Message für die erste Runde in Register holen
// Message für die erste Runde in Register holen
#pragma unroll 16
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];

Expand All @@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
uint32_t *hefty = heftyHashes + 8 * hashPosition;
if (BLOCKSIZE == 84) {
// den thread-spezifischen Hefty1 hash einsetzen
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
Expand All @@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t

// erste Runde
blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );


// zweite Runde
#pragma unroll 15
for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );

// Hash rauslassen
uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
#pragma unroll 8
Expand Down Expand Up @@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
sizeof(host_SecondRound),
0, cudaMemcpyHostToDevice);

// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
// Speicher für alle Ergebnisse belegen
CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
}

static int BLOCKSIZE = 84;
Expand All @@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
{
unsigned char PaddedMessage[128];
if (len == 84) {
// Message mit Padding für erste Runde bereitstellen
// Message mit Padding für erste Runde bereitstellen
memcpy(PaddedMessage, pdata, 84);
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+116, 0, 12);
PaddedMessage[116] = 0x80;
} else if (len == 80) {
memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+112, 0, 16);
PaddedMessage[112] = 0x80;
}
Expand All @@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);

// Größe des dynamischen Shared Memory Bereichs
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;

if (BLOCKSIZE == 80)
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
else if (BLOCKSIZE == 84)
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
}
7 changes: 0 additions & 7 deletions heavy/cuda_blake512.h

This file was deleted.

34 changes: 18 additions & 16 deletions heavy/cuda_combine.cu
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
#include "cuda_helper.h"
#include <stdio.h>

// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashoutput[8];
#include "cuda_helper.h"

// globaler Speicher für unsere Ergebnisse
static uint32_t *d_hashoutput[8];
extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8];
extern uint32_t *d_nonceVector[8];

extern uint32_t *heavy_nonceVector[8];

/* Combines top 64-bits from each hash into a single hash */
static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
__device__
static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
{
uint32_t lout[8]; // Combining in Registern machen

Expand Down Expand Up @@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
out[i] = lout[i];
}

__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
__global__
void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
Expand All @@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
}
}

// Setup-Funktionen
__host__ void combine_cpu_init(int thr_id, int threads)
__host__
void combine_cpu_init(int thr_id, int threads)
{
// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
// Speicher für alle Ergebnisse belegen
CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
}

__host__
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
{
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
Expand All @@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);

// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;

combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);

// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
}
7 changes: 0 additions & 7 deletions heavy/cuda_combine.h

This file was deleted.

Loading

0 comments on commit 1b65cd0

Please sign in to comment.