From 45c7e250762a7bf517893002dfa638999c382e3f Mon Sep 17 00:00:00 2001 From: Chris <34682781+monkins1010@users.noreply.github.com> Date: Sat, 30 Oct 2021 11:32:08 +0100 Subject: [PATCH] Update verus.cu --- verus/verus.cu | 364 +++++++++++++++++-------------------------------- 1 file changed, 126 insertions(+), 238 deletions(-) diff --git a/verus/verus.cu b/verus/verus.cu index d91b03abc4..fa3f3604e8 100644 --- a/verus/verus.cu +++ b/verus/verus.cu @@ -114,7 +114,7 @@ __global__ void verus_gpu_hash(const uint32_t threads, const uint32_t startNonce uint128m * __restrict__ d_key_input, uint8_t version); __global__ void verus_extra_gpu_prepare(const uint32_t threads, uint128m * d_key_input); -#define TOTAL_MAX 0x20000 +#define TOTAL_MAX 0x10000 static uint32_t *d_nonces[MAX_GPUS]; static uint4 *d_long_keys[MAX_GPUS]; @@ -130,7 +130,6 @@ void verus_init(int thr_id, uint32_t throughput) { //cudaFuncSetCacheConfig(verus_gpu_hash, cudaFuncCachePreferEqual); CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], 1 * sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMalloc(&d_long_keys[thr_id], TOTAL_MAX * VERUS_KEY_SIZE)); }; @@ -161,43 +160,19 @@ void verus_hash(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *res CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_nonces[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); }; -__device__ __forceinline__ -uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) { - uint32_t result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b), "r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA -#else - result = a^b^c; -#endif - return result; -} - -__device__ __forceinline__ uint128m _mm_xor_si128_emu(uint128m a, uint128m b) -{ - uint128m result; - asm("xor.b32 %0, %1, %2; // xor1" : "=r"(result.x) : "r"(a.x), "r"(b.x)); - asm("xor.b32 %0, %1, %2; // xor1" : "=r"(result.y) : "r"(a.y), "r"(b.y)); - asm("xor.b32 %0, %1, %2; // xor1" : "=r"(result.z) : "r"(a.z), "r"(b.z)); - asm("xor.b32 %0, %1, %2; // xor1" : "=r"(result.w) : "r"(a.w), "r"(b.w)); - return result; - - -} +#define _mm_xor_si128_emu(a,b) a^b; __device__ uint128m _mm_clmulepi64_si128_emu(uint128m ai, uint128m bi, int imm) { - uint64_t a = ((uint64_t*)&ai)[0]; // (0xffffffffull & ai.x) | ((0x00000000ffffffffull & ai.y) << 32);//+ (imm & 1)); + uint64_t a = ((uint64_t*)&ai)[0]; - uint64_t b = ((uint64_t*)&bi)[1]; // (0xffffffffull & bi.z) | ((0x00000000ffffffffull & bi.w) << 32); + uint64_t b = ((uint64_t*)&bi)[1]; - // uint8_t i; - // uint2 u[8]; - uint64_t r[2]; //uint128m r; - //uint2 tmp; - - if (__popcll(a) > __popcll(b)) { + uint64_t r[2]; + if (__popcll(a) > __popcll(b)) + { a = b; b = ((uint64_t*)&ai)[0]; } r[0] = 0; r[1] = 0; @@ -212,70 +187,6 @@ __device__ uint128m _mm_clmulepi64_si128_emu(uint128m ai, uint128m bi, int imm r[1] ^= b >> ((counter)); }; - - /* - // XCHG(a,b); - u[0].x = 0; //000 x b u[0].y = 0; - } - - u[1].x = bi.z; //001 x b u[1].y = bi.w; //001 x - - u[2].x = u[1].x << 1; //010 x b - u[2].y = __funnelshift_l(u[1].x, u[1].y, 1); //010 x b - - u[3].x = u[2].x ^ bi.z; //011 x b - u[3].y = u[2].y ^ bi.w; //011 x b - - u[4].x = u[2].x << 1; //100 x b - u[4].y = __funnelshift_l(u[2].x, u[2].y, 1); //010 x b - - u[5].x = u[4].x ^ bi.z; //101 x b - u[5].y = u[4].y ^ bi.w; //101 x b - - u[6].x = u[3].x << 1; //110 x b - u[6].y = __funnelshift_l(u[3].x, u[3].y, 1); //010 x b - - u[7].x = u[6].x ^ bi.z; //111 x b - u[7].y = u[6].y ^ bi.w; - //Multiply - r.x = u[a & 7].x; //first window only affects lower word - r.y = u[a & 7].y; - r.z = r.w = 0; - //#pragma unroll - for (i = 3; i < 31; i += 3) { - tmp.x = u[a >> i & 7].x; - tmp.y = u[a >> i & 7].y; - r.x ^= (tmp.x << i) ; - r.y ^= __funnelshift_l(tmp.x, tmp.y, i); - r.z ^= ( tmp.y >> ( 32 - i)); - - } - - //#pragma unroll - for (i = 33; i < 64; i += 3) { - tmp.x = u[a >> i & 7].x; - tmp.y = u[a >> i & 7].y; - r.y ^= (tmp.x << (i - 32 )); - r.z ^= __funnelshift_r(tmp.x, tmp.y, (64-i)); - r.w ^= tmp.y >> (64 - i); - } - - - if ((bi.w ) & 0x80000000) - { - uint32_t t0 = LIMMY_R(ai.x, ai.y, 1); - uint32_t t1 = ai.y >> 1; - r.z ^= (t0 & 0xDB6DB6DB); //0, 21x 110 - r.w ^= (t1 & 0x36DB6DB6); //0x6DB6DB6DB6DB6DB6 -> 0x36DB6DB6DB6DB6DB after >>1 - } - if ((bi.w ) & 0x40000000) - { - uint32_t t0 = LIMMY_R(ai.x, ai.y, 2); - uint32_t t1 = ai.y >> 2; - r.z ^= (t0 & 0x49249249); //0, 21x 100 - r.w ^= (t1 & 0x12492492); //0x4924924924924924 -> 0x1249249249249249 after >>2 - } - */ return ((uint128m*)&r)[0]; } @@ -283,39 +194,16 @@ __device__ __forceinline__ uint128m _mm_clmulepi64_si128_emu2(uint128m ai) { uint64_t a = ((uint64_t*)&ai)[1]; - //uint64_t b = 27 ; - uint8_t i; //window size s = 4, - //uint64_t two_s = 16; //2^s - //uint64_t smask = 15; //s 15 - uint8_t u[8]; - uint128m r; - uint64_t tmp; - //Precomputation - - //#pragma unroll - u[0] = 0; //000 x b - u[1] = 27; //001 x b - u[2] = 54; // u[1] << 1; //010 x b - u[3] = 45; //011 x b - u[4] = 108; //100 x b - u[5] = 119; //101 x b - u[6] = 90; //110 x b - u[7] = 65; //111 x b - //Multiply - ((uint64_t*)&r)[0] = u[a & 7]; //first window only affects lower word - - r.z = r.w = 0; - //#pragma unroll - for (i = 3; i < 64; i += 3) { - tmp = u[a >> i & 7]; - r.x ^= (tmp << i) & 0xffffffff; - r.y ^= ((tmp << i) & 0xffffffff00000000) >> 32; - // ((uint64_t*)&r)[1] ^= tmp >> (64 - i); - r.z ^= (tmp >> (64 - i)) & 0xffffffff; - r.w ^= ((tmp >> (64 - i)) & 0xffffffff00000000) >> 32; - } + uint64_t result[2] = { 0,0 }; + result[0] = a; + result[0] ^= a << 1; + result[1] ^= a >> 63; + result[0] ^= a << 3; + result[1] ^= a >> 61; + result[0] ^= a << 4; + result[1] ^= a >> 60; - return r; + return AS_UINT4(result); } #define _mm_load_si128_emu(p) (*(uint128m*)(p)); @@ -342,6 +230,17 @@ __device__ __forceinline__ void _mm_unpackboth_epi32_emu(uint128m &a, uint12 asm("mov.b64 {%0, %1}, %2; ": "=r"(a.w), "=r"(b.y) : "l"(value)); } +__device__ __forceinline__ uint128m unpackandmix(uint128m a, uint128m b, uint128m acc) +{ + uint128m tmp; + + tmp.x = a.x ^ acc.x ^ a.z; + tmp.y = b.x ^ acc.y ^ b.z; + tmp.z = a.y ^ acc.z ^ a.w; + tmp.w = b.y ^ acc.w ^ b.w; + + return tmp; +} __device__ __forceinline__ uint128m _mm_unpacklo_epi32_emu(uint128m a, uint128m b) { @@ -430,12 +329,12 @@ __device__ __forceinline__ uint128m _mm_set_epi64x_emu(uint64_t hi, uint64_t lo ((uint64_t *)&result)[1] = hi; return result; } -__device__ __forceinline__ uint128m _mm_shuffle_epi8_emu(uint128m b) +__device__ __forceinline__ uint128m _mm_shuffle_epi8_emu(uint2 b) { uint128m result = { 0 }; const uint128m M = { 0x2d361b00,0x415a776c,0xf5eec3d8,0x9982afb4 }; - const uint128m Q = { 0x80808080, 0x80808080, 0x80808080, 0x80808080 }; - const uint128m W = b & Q; + const uint2 Q = { 0x80808080, 0x80808080}; + const uint2 W = b & Q; #pragma unroll @@ -452,12 +351,12 @@ __device__ __forceinline__ uint128m _mm_shuffle_epi8_emu(uint128m b) -__device__ __forceinline__ uint128m _mm_srli_si128_emu(uint128m input, int imm8) +__device__ __forceinline__ uint2 _mm_srli_si128_emu(uint128m input, int imm8) { //we can cheat here as its an 8 byte shift just copy the 64bits - uint128m temp; + uint2 temp; ((uint64_t*)&temp)[0] = ((uint64_t*)&input)[1]; - ((uint64_t*)&temp)[1] = 0; +// ((uint64_t*)&temp)[1] = 0; return temp; @@ -475,11 +374,9 @@ __device__ __forceinline__ uint128m _mm_mulhrs_epi16_emu(uint128m _a, uint12 for (int i = 0; i < 8; i++) { asm("mad.lo.s32 %0, %1, %2, 16384; ": "=r"(po) : "r"((int32_t)a[i]), "r"((int32_t)b[i])); - result[i] = po >> 15; - // result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15); - } + return *(uint128m *)result; } @@ -489,7 +386,7 @@ __device__ __forceinline__ void case_0(uint128m &prand, uint128m &prandex, c { const uint128m temp1 = prandex; - const uint128m temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp2 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); const uint128m add1 = _mm_xor_si128_emu(temp1, temp2); @@ -532,7 +429,7 @@ __device__ __forceinline__ void case_4(uint128m &prand, uint128m &prandex, co const uint128m temp12 = prandex; prandex = tempa2; - const uint128m temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp22 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); const uint128m add12 = _mm_xor_si128_emu(temp12, temp22); acc = _mm_xor_si128_emu(add12, acc); @@ -555,7 +452,7 @@ __device__ __forceinline__ void case_8(uint128m &prand, uint128m &prandex, c const uint128m temp12 = prand; prand = tempa2; - const uint128m temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp22 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); const uint128m add12 = _mm_xor_si128_emu(temp12, temp22); const uint128m clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10); acc = _mm_xor_si128_emu(clprod12, acc); @@ -573,7 +470,7 @@ __device__ __forceinline__ void case_0c_1(uint128m &prand, uint128m &prandex, uint64_t selector, uint128m &acc) { const uint128m temp1 = prand; - const uint128m temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp2 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); const uint128m add1 = _mm_xor_si128_emu(temp1, temp2); // cannot be zero here @@ -617,7 +514,7 @@ __device__ __forceinline__ void case_0c_2(uint128m &prand, uint128m &prandex, uint64_t selector, uint128m &acc) { const uint128m temp1 = prand; - const uint128m temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp2 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); const uint128m add1 = _mm_xor_si128_emu(temp1, temp2); // cannot be zero here @@ -665,7 +562,7 @@ __device__ __forceinline__ void case_10(uint128m &prand, uint128m &prandex, c uint128m tmp; - uint128m temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + uint128m temp1 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); uint128m temp2 = _mm_load_si128_emu(pbuf); AES2_EMU(temp1, temp2, 0); @@ -676,11 +573,7 @@ __device__ __forceinline__ void case_10(uint128m &prand, uint128m &prandex, c MIX2_EMU(temp1, temp2); AES2_EMU(temp1, temp2, 8); - MIX2_EMU(temp1, temp2); - - - acc = _mm_xor_si128_emu(temp1, acc); - acc = _mm_xor_si128_emu(temp2, acc); + acc = unpackandmix(temp1, temp2, acc); const uint128m tempa1 = prand; const uint128m tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1); @@ -694,43 +587,39 @@ __device__ __forceinline__ void case_14(uint128m &prand, uint128m &prandex, c uint64_t selector, uint128m &acc, uint128m *randomsource, uint32_t prand_idx, uint32_t *sharedMemory1) { // we'll just call this one the monkins loop, inspired by Chris - const uint128m *buftmp = pbuf - (((selector & 1) << 1) - 1); + const uint128m *buftmp = pbuf - ((selector & 1) ? 1 : -1); uint128m tmp; // used by MIX2 uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times uint128m *rc = &randomsource[prand_idx]; - uint64_t aesround = 0; uint128m onekey; uint64_t loop_c; do { loop_c = selector & ((uint64_t)0x10000000 << rounds); - if (loop_c) - { - onekey = _mm_load_si128_emu(rc++); - const uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); - const uint128m add1 = _mm_xor_si128_emu(onekey, temp2); - const uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); - acc = _mm_xor_si128_emu(clprod1, acc); - rounds--; - if (rounds != (uint64_t)0xffffffffffffffff) loop_c = selector & ((uint64_t)0x10000000 << rounds); - } - if (!loop_c && (rounds != (uint64_t)0xffffffffffffffff)) - { - onekey = _mm_load_si128_emu(rc++); - uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf); - - const uint64_t roundidx = aesround++ << 2; - AES2_EMU(onekey, temp2, roundidx); + if (loop_c) + { + onekey = _mm_load_si128_emu(rc++); + const uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); + const uint128m add1 = _mm_xor_si128_emu(onekey, temp2); + const uint128m clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10); + acc = _mm_xor_si128_emu(clprod1, acc); + rounds--; + if (rounds != (uint64_t)0xffffffffffffffff) loop_c = selector & ((uint64_t)0x10000000 << rounds); + } + if (!loop_c && (rounds != (uint64_t)0xffffffffffffffff)) + { + onekey = _mm_load_si128_emu(rc++); + uint128m temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf); - MIX2_EMU(onekey, temp2); + const uint64_t roundidx = aesround++ << 2; + AES2_EMU(onekey, temp2, roundidx); + acc = unpackandmix(onekey, temp2, acc); - acc = _mm_xor_si128_emu(onekey, acc); - acc = _mm_xor_si128_emu(temp2, acc); - rounds--; - } + rounds--; + } } while (rounds != (uint64_t)0xffffffffffffffff); const uint128m tempa1 = (prand); @@ -746,7 +635,7 @@ __device__ __forceinline__ void case_18_1(uint128m &prand, uint128m &prandex uint64_t selector, uint128m &acc, uint128m *randomsource, uint32_t prand_idx) { // we'll just call this one the monkins loop, inspired by Chris - const uint4 *buftmp = pbuf - (((selector & 1) << 1) - 1); + const uint4 *buftmp = pbuf - ((selector & 1) ? 1 : -1); uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times @@ -759,26 +648,26 @@ __device__ __forceinline__ void case_18_1(uint128m &prand, uint128m &prandex loop_c = selector & ((uint64_t)0x10000000 << rounds); if (loop_c) { - onekey = _mm_load_si128_emu(rc++); - const uint4 temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); - const uint4 add1 = _mm_xor_si128_emu(onekey, temp2); - - const int32_t divisor = (uint32_t)selector; - const int64_t dividend = ((int64_t*)&add1)[0]; - uint4 modulo = { 0 }; ((int32_t*)&modulo)[0] = (dividend % divisor); - acc = modulo ^ acc; - rounds--; - if (rounds != (uint64_t)0xffffffffffffffff) loop_c = selector & ((uint64_t)0x10000000 << rounds); + onekey = _mm_load_si128_emu(rc++); + const uint4 temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp); + const uint4 add1 = _mm_xor_si128_emu(onekey, temp2); + + const int32_t divisor = (uint32_t)selector; + const int64_t dividend = ((int64_t*)&add1)[0]; + uint4 modulo = { 0 }; ((int32_t*)&modulo)[0] = (dividend % divisor); + acc = modulo ^ acc; + rounds--; + if (rounds != (uint64_t)0xffffffffffffffff) loop_c = selector & ((uint64_t)0x10000000 << rounds); } if (!loop_c && (rounds != (uint64_t)0xffffffffffffffff)) { - onekey = _mm_load_si128_emu(rc++); - uint4 temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf); - uint4 add1 = (onekey^ temp2); - uint4 clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0); - uint4 clprod2 = _mm_mulhrs_epi16_emu(acc, clprod1); - acc = clprod2^ acc; - rounds--; + onekey = _mm_load_si128_emu(rc++); + uint4 temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf); + uint4 add1 = (onekey^ temp2); + uint4 clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0); + uint4 clprod2 = _mm_mulhrs_epi16_emu(acc, clprod1); + acc = clprod2^ acc; + rounds--; } } while (rounds != (uint64_t)0xffffffffffffffff); @@ -792,7 +681,7 @@ __device__ __forceinline__ void case_18_2(uint128m &prand, uint128m &prandex uint64_t selector, uint128m &acc, uint128m *randomsource, uint32_t prand_idx) { // we'll just call this one the monkins loop, inspired by Chris - const uint4 *buftmp = pbuf - (((selector & 1) << 1) - 1); + const uint4 *buftmp = pbuf - ((selector & 1) ? 1 : -1); uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times @@ -876,13 +765,31 @@ __device__ __forceinline__ void case_1c_2(uint128m &prand, uint128m &prande prand = tempa2; acc = _mm_xor_si128_emu(tempa3, acc); - const uint128m temp4 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1)); + const uint128m temp4 = _mm_load_si128_emu(pbuf - ((selector & 1) ? 1 : -1)); acc = _mm_xor_si128_emu(temp4, acc); const uint128m tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3); const uint128m tempb2 = _mm_xor_si128_emu(tempb1, tempa3); prandex = tempb2; } +__device__ __forceinline__ uint2 precompReduction64(uint128m A) { + + + //static const uint128m M = { 0x2d361b00,0x415a776c,0xf5eec3d8,0x9982afb4 }; + // const uint128m tmp = { 27 }; + // A.z = 0; + //tmp.x = 27u; + uint128m Q2 = _mm_clmulepi64_si128_emu2(A); + uint128m Q3 = _mm_shuffle_epi8_emu({ Q2.z,Q2.w }); + + //uint128m Q4 = _mm_xor_si128_emu(Q2, A); + uint2 final; + final.x = xor3(A.x, Q2.x, Q3.x); + final.y = xor3(A.y, Q2.y, Q3.y); + + return final; +} + #define PRE selector = _mm_cvtsi128_si64_emu(acc);\ if (i > 0) {\ @@ -904,7 +811,7 @@ __device__ __forceinline__ void case_1c_2(uint128m &prand, uint128m &prande pbuf = buf + (acc.x & 3);\ case_v = selector & 0x1cu; -__device__ __forceinline__ uint128m __verusclmulwithoutreduction64alignedrepeatgpu(uint128m * __restrict__ randomsource, const uint128m * __restrict__ buf, +__device__ __forceinline__ uint2 __verusclmulwithoutreduction64alignedrepeatgpu(uint128m * __restrict__ randomsource, const uint128m * __restrict__ buf, uint32_t * __restrict__ sharedMemory1, uint8_t version) { uint128m const *pbuf; @@ -926,8 +833,9 @@ __device__ __forceinline__ uint128m __verusclmulwithoutreduction64alignedrepe //#pragma unroll int i = 0; uint8_t case_v; - - PRE + selector = _mm_cvtsi128_si64_emu(acc); + pbuf = buf + (acc.x & 3); + case_v = selector & 0x1cu; do { @@ -991,10 +899,10 @@ __device__ __forceinline__ uint128m __verusclmulwithoutreduction64alignedrepe } if (case_v == 0xc) { - if(version == 3) - case_0c_1(prand, prandex, pbuf, selector, acc); + if (version == 3) + case_0c_1(prand, prandex, pbuf, selector, acc); else - case_0c_2(prand, prandex, pbuf, selector, acc); + case_0c_2(prand, prandex, pbuf, selector, acc); randomsource[prand_idx] = prand; randomsource[prandex_idx] = prandex; @@ -1019,9 +927,9 @@ __device__ __forceinline__ uint128m __verusclmulwithoutreduction64alignedrepe } if (case_v == 0x1c) - { + { if (version == 3) - case_1c_1(prand, prandex, pbuf, selector, acc); + case_1c_1(prand, prandex, pbuf, selector, acc); else case_1c_2(prand, prandex, pbuf, selector, acc); @@ -1036,8 +944,9 @@ __device__ __forceinline__ uint128m __verusclmulwithoutreduction64alignedrepe } while (i != 32); + acc.x ^= 0x00010000; - return acc; + return precompReduction64(acc); } @@ -1069,24 +978,6 @@ __device__ __forceinline__ uint32_t haraka512_port_keyed2222(uint128m * __res } -__device__ __forceinline__ uint64_t precompReduction64(uint128m A) { - - - //static const uint128m M = { 0x2d361b00,0x415a776c,0xf5eec3d8,0x9982afb4 }; - // const uint128m tmp = { 27 }; - // A.z = 0; - //tmp.x = 27u; - uint128m Q2 = _mm_clmulepi64_si128_emu2(A); - uint128m Q3 = _mm_shuffle_epi8_emu(_mm_srli_si128_emu(Q2, 8)); - - //uint128m Q4 = _mm_xor_si128_emu(Q2, A); - uint128m final; - final.x = xor3(A.x, Q2.x, Q3.x); - final.y = xor3(A.y, Q2.y, Q3.y); - - return _mm_cvtsi128_si64_emu(final);/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE -} - __global__ __launch_bounds__(THREADS, 1) @@ -1094,8 +985,9 @@ void verus_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t uint128m * __restrict__ d_key_input, uint8_t version) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - uint128m mid; // , biddy[VERUS_KEY_SIZE128]; - uint128m s[4]; + + __shared__ uint128m j[4 * THREADS]; + uint128m *s = &j[threadIdx.x << 2]; const uint32_t nounce = startNonce + thread; @@ -1125,40 +1017,36 @@ void verus_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t d_key_input[(VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1))) + ((threadIdx.x + i) & 511)] = sharedMemory3[((threadIdx.x + i) & 511)]; } - int b = threadIdx.x & 31; + int b = threadIdx.x % 40; for (int i = 0; i < 40; i++) { - d_key_input[((VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))) + 512 + ((i + b) % 40)] = sharedMemory3[512 + ((i + b) % 40)]; + d_key_input[((VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))) + 512 + ((b) % 40)] = sharedMemory3[512 + ((b) % 40)]; b++; } s[2].x = nounce; s[0] = s[0] ^ s[2]; s[1] = s[1] ^ s[3]; - mid = __verusclmulwithoutreduction64alignedrepeatgpu(&d_key_input[(VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))], s, sharedMemory1[0], version); - mid.x ^= 0x00010000; + uint2 acc = __verusclmulwithoutreduction64alignedrepeatgpu(&d_key_input[(VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))], s, sharedMemory1[0], version); - uint64_t acc = precompReduction64(mid);; s[0] = blockhash_half[0]; s[1] = blockhash_half[1]; - memcpy(((uint8_t*)&s) + 47, &acc, 8); - memcpy(((uint8_t*)&s) + 55, &acc, 8); - memcpy(((uint8_t*)&s) + 63, &acc, 1); - //uint64_t mask = 8191 >> 4; - acc &= 511; - - + uint2 tmp = ROR2(acc,8); + s[3].x = tmp.x; + s[3].y = tmp.y; + s[3].z = tmp.x; + s[3].w = tmp.y; - //haraka512_port_keyed((unsigned char*)hash, (const unsigned char*)s, (const unsigned char*)(biddy + mask), sharedMemory1, nounce); - uint32_t hash = haraka512_port_keyed2222(s, (&d_key_input[(VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))] + acc), sharedMemory1[0]); - if (hash < ptarget[7]) { + s[2].w = (s[2].w & 0x00ffffff) | (acc.x & 0xff) << 24; + acc.x &= 511; + uint32_t hash = haraka512_port_keyed2222(s, (&d_key_input[(VERUS_KEY_SIZE128 * (thread & (TOTAL_MAX - 1)))] + acc.x), sharedMemory1[0]); + if (hash < ptarget[7]) + { resNonce[0] = nounce; - } }; -