// cuda_maxwell.cu (forked from sgminer-dev/sgminer)
extern "C" {
#include "logging.h"
}
#include "driver-cuda.h"
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdlib.h>
#include <emmintrin.h>
#include <malloc.h>
#include <sys/time.h>
#include <assert.h>
#ifndef _WIN32
#include <byteswap.h>
#include <arpa/inet.h>
#endif
#include <new>
#include <unistd.h>
#define DELIMITER '/'
#define __FILENAME__ ( strrchr(__FILE__, DELIMITER) != NULL ? strrchr(__FILE__, DELIMITER)+1 : __FILE__ )
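// Log (without aborting) any error returned by the wrapped CUDA call; the
// leading cudaGetLastError() clears a previously latched error so the report
// reflects only the call being checked. Minimal usage sketch (dev_buf and len
// are placeholders, not names from this file):
//   checkCudaErrors(cudaMemset(dev_buf, 0, len));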
#define checkCudaErrors(x) { \
cudaGetLastError(); \
cudaError_t err; \
err = x; \
if (err != cudaSuccess) \
applog(LOG_ERR, "GPU cudaError %d (%s) calling '%s' (%s line %d)\n", err, cudaGetErrorString(err), #x, __FILENAME__, __LINE__); \
}
// number of threads collaborating on one work unit (hash)
#define THREADS_PER_WU 1
#define MAX_WARPS_BLOCK 24
#define MAGIC 1024
// make scratchpad size dependent on N and LOOKUP_GAP
#define LOOKUP_GAP 1
#define SCRATCH (MAGIC*32)
#define WU_PER_WARP (32 / THREADS_PER_WU)
#define BACKOFF 4
#define WU_PER_LAUNCH (GRID_BLOCKS*WU_PER_BLOCK)
#define WU_PER_BLOCK (WU_PER_WARP*WARPS_PER_BLOCK)
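// Derived launch geometry: each warp processes WU_PER_WARP work units (32 with
// THREADS_PER_WU = 1), a block runs WARPS_PER_BLOCK warps, and one launch of
// GRID_BLOCKS blocks covers WU_PER_LAUNCH nonces. Worked example (hypothetical
// tuning result): GRID_BLOCKS = 8, WARPS_PER_BLOCK = 4 gives
// WU_PER_BLOCK = 32 * 4 = 128 and WU_PER_LAUNCH = 8 * 128 = 1024 nonces.
// SCRATCH is the per-work-unit scratchpad size in uint32_t words
// (MAGIC * 32 = 32768 words, i.e. 128 KB per work unit).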
//__constant__ uint32_t *c_V[TOTAL_WARP_LIMIT];
static __device__ __inline__ unsigned int __laneId(void) {
unsigned int laneId;
asm( "mov.u32 %0, %%laneid;" : "=r"( laneId ));
return (laneId);
}
#define c_N 1024
#define c_N_1 1023
static int cuda_scrypt_core(struct cuda_dev *dev, struct cuda_thread_data *tdata, int cur, int N);
static uint64_t max_work(struct cuda_dev *dev, int dev_id, int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t min_nonce, uint32_t max_nonce, uint32_t *found_nonce);
extern "C"
int cuda_maxwell_allocate_mem(struct cuda_dev *dev, int dev_id) {
size_t free_dev_mem, total_dev_mem;
cudaSetDevice(dev_id);
// set whatever cache configuration and shared memory bank mode the kernel prefers
checkCudaErrors(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
checkCudaErrors(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
cudaMemGetInfo(&free_dev_mem, &total_dev_mem);
applog(LOG_NOTICE, "GPU #%d: %lu MB free on card", dev->sgminer_id, (unsigned long)(free_dev_mem / 1024 / 1024));
dev->maxwarps = (free_dev_mem / (((SCRATCH + 256) * WU_PER_WARP) * sizeof(uint32_t)));
dev->mem_allocated = 1;
return (0);
}
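// Enumerate candidate (GRID_BLOCKS x WARPS_PER_BLOCK) launch configurations
// whose total warp count falls inside the MINTW..MAXTW window. Called twice:
// first with tlist == NULL just to count the candidates, then with an
// allocated array to fill in; the return value is the candidate count.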
static int _populate_tlist(struct cuda_dev *dev, struct cuda_tlist *tlist, int tlist_cnt) {
int WARPS_PER_BLOCK;
// lower bound on total warps so there is enough parallelism to keep the card busy
int MINTW = 16;
// upper bound: how many scratchpad-backed warps fit in device memory
int MAXTW = dev->maxwarps;
// grid size bounds
int MINB = 4;
int MAXB = MAXTW;
if (tlist) applog(LOG_ERR, "GPU #%d: Tuning card, maximum total warps (BxW): %d", dev->sgminer_id, MAXTW);
int tlist_pos = 0;
for (int GRID_BLOCKS = MINB; GRID_BLOCKS <= MAXB; ++GRID_BLOCKS) {
for (WARPS_PER_BLOCK = 1; WARPS_PER_BLOCK <= MAX_WARPS_BLOCK; ++WARPS_PER_BLOCK) {
if ((GRID_BLOCKS * WARPS_PER_BLOCK >= MINTW) && (GRID_BLOCKS * WARPS_PER_BLOCK <= MAXTW)) {
if (tlist_pos < tlist_cnt) {
tlist[tlist_pos].grid = GRID_BLOCKS;
tlist[tlist_pos].warps = WARPS_PER_BLOCK;
}
tlist_pos++;
}
}
}
return (tlist_pos);
}
static int cuda_init_autotune_tlist(struct cuda_dev *dev) {
int tlist_len;
struct cuda_tlist *new_tlist;
tlist_len = _populate_tlist(dev, NULL, 0);
new_tlist = (struct cuda_tlist *)malloc(sizeof(struct cuda_tlist) * tlist_len);
memset(new_tlist, 0, sizeof(struct cuda_tlist) * tlist_len);
_populate_tlist(dev, new_tlist, tlist_len);
if (dev->tlist) {
free(dev->tlist);
}
dev->tlist = new_tlist;
dev->tlist_len = tlist_len;
dev->tlist_tunelen = tlist_len;
dev->tlist_pos = 0;
return (0);
}
static double doubletime(void) {
timeval tv;
double t;
gettimeofday(&tv, NULL);
t = tv.tv_sec + (tv.tv_usec / 1000000.0);
return (t);
}
/* qsort comparator: sort tuning entries by best_rate, highest first */
static int _tlist_cmp(const void *ta, const void *tb) {
double a = ((const struct cuda_tlist *)ta)->best_rate;
double b = ((const struct cuda_tlist *)tb)->best_rate;
return ((b > a) - (b < a));
}
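// Autotuning driver: until autotune_complete is set, each call benchmarks a
// short batch with the next candidate launch configuration from tlist (three
// runs per candidate, the first as warm-up). Once every candidate has been
// timed, the list is sorted by best_rate, the slower half is dropped and the
// batch size is increased; when only one candidate remains it becomes the
// permanent configuration and all further calls go straight to max_work().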
uint64_t cuda_maxwell_tune_work(struct cuda_dev *dev, int dev_id, int thr_id, uint32_t *data, const uint32_t *ptarget,
uint32_t min_nonce, uint32_t max_nonce, uint32_t *found_nonce) {
double start_time;
uint64_t rv = 0;
struct cuda_tlist *tlist_ent;
uint32_t adj_max_nonce;
adj_max_nonce = min(max_nonce, min_nonce + dev->cur_batch);
if (!dev->mem_allocated) cuda_maxwell_allocate_mem(dev, dev_id);
if (dev->autotune_complete) {
dev->wu_per_launch = dev->grid_blocks * dev->warps_per_block * WU_PER_WARP;
return (max_work(dev, dev_id, thr_id, data, ptarget, min_nonce, max_nonce, found_nonce));
}
if (!dev->tlist) {
cuda_init_autotune_tlist(dev);
}
if (dev->tlist_pos >= dev->tlist_tunelen) {
dev->tlist_pos = 0;
qsort(dev->tlist, dev->tlist_len, sizeof(struct cuda_tlist), _tlist_cmp);
dev->tlist_tunelen = dev->tlist_tunelen / 2;
dev->cur_batch = min(dev->cur_batch + dev->min_batch, dev->max_batch);
if (dev->tlist_tunelen == 1) {
applog(LOG_WARNING, "GPU%d: Autotune completed: %dx%d = %.2fkH/sec",
dev->sgminer_id, dev->tlist->grid, dev->tlist->warps, dev->tlist->best_rate / 1000);
dev->grid_blocks = dev->tlist->grid;
dev->warps_per_block = dev->tlist->warps;
dev->wu_per_launch = dev->grid_blocks * dev->warps_per_block * WU_PER_WARP;
dev->autotune_complete = 1;
dev->cur_batch = dev->max_batch;
dev->mem_allocated = 1;
return (max_work(dev, dev_id, thr_id, data, ptarget, min_nonce, max_nonce, found_nonce));
} else {
applog(LOG_INFO, "GPU%d: Best performance so far: %dx%d = %.2fkH/sec (batch %d)",
dev->sgminer_id, dev->tlist->grid, dev->tlist->warps, dev->tlist->best_rate / 1000, dev->cur_batch);
}
} else {
tlist_ent = &dev->tlist[dev->tlist_pos];
dev->grid_blocks = tlist_ent->grid;
dev->warps_per_block = tlist_ent->warps;
dev->wu_per_launch = dev->grid_blocks * dev->warps_per_block * WU_PER_WARP;
start_time = doubletime();
rv = max_work(dev, dev_id, thr_id, data, ptarget, min_nonce, adj_max_nonce, found_nonce);
if (rv == 0) {
// Failed
tlist_ent->hashes_done = 0;
dev->tlist_pos++;
return(cuda_maxwell_tune_work(dev,dev_id,thr_id,data,ptarget,min_nonce,max_nonce,found_nonce));
} else if (rv > 1000000) {
tlist_ent->hashes_done = 0;
dev->tlist_pos++;
return(cuda_maxwell_tune_work(dev,dev_id,thr_id,data,ptarget,min_nonce,max_nonce,found_nonce));
}
if (tlist_ent->run_cnt) {
tlist_ent->hashes_done += rv;
tlist_ent->time_spent += (doubletime() - start_time);
tlist_ent->best_rate = tlist_ent->hashes_done / tlist_ent->time_spent;
}
tlist_ent->run_cnt++;
if (tlist_ent->run_cnt >= 3) {
tlist_ent->run_cnt = 0;
dev->tlist_pos++;
}
}
return (rv);
}
#if 0
void cuda_dump_uint32(int dev_id, const char *prefix, uint32_t *data, int len) {
char cbuf[513];
uint32_t dbuf[64];
int pos = 0;
cudaMemcpy(dbuf, data, len * sizeof(uint32_t), cudaMemcpyDeviceToHost);
for (pos = 0; pos < len && pos < 64; pos++) snprintf(&cbuf[pos * 8], sizeof(cbuf) - (pos * 8), "%08x", dbuf[pos]);
applog(LOG_INFO, "GPU #%d: %s%s", dev_id, prefix, cbuf);
}
#endif
static int max_free_buffers(struct cuda_dev *dev, struct cuda_thread_data *tdata) {
checkCudaErrors(cudaFree(tdata->context_idata[0]));
checkCudaErrors(cudaFree(tdata->context_odata[0]));
checkCudaErrors(cudaFree(tdata->context_tstate[0]));
checkCudaErrors(cudaFree(tdata->context_ostate[0]));
return (0);
}
static void max_free_scratch(struct cuda_thread_data *tdata) {
while (tdata->scratch_mem_size) {
checkCudaErrors(cudaFree(tdata->scratch_mem_host[tdata->scratch_mem_size - 1]));
tdata->scratch_mem_size--;
}
}
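// Grow or shrink the per-warp scratchpad pool to match the current launch size
// (warps_per_block * grid_blocks). Each warp gets its own device buffer of
// SCRATCH * WU_PER_WARP words, and the table of buffer pointers is copied to
// scratch_mem_dev so the kernel can index it by warp.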
static int max_initialize_scratch(struct cuda_dev *dev, struct cuda_thread_data *tdata) {
int w = dev->warps_per_block * dev->grid_blocks;
if (tdata->scratch_mem_size == w) return (0);
if (!tdata->scratch_mem_dev) {
// Allocate space for the device to track its scratch memory
if (cudaMalloc((void **)&tdata->scratch_mem_dev, sizeof(uint32_t *) * 8192))
return(-1);
}
while (tdata->scratch_mem_size < w) {
if (cudaMalloc((void **)&tdata->scratch_mem_host[tdata->scratch_mem_size], (SCRATCH * WU_PER_WARP) * sizeof(uint32_t)))
return(-1);
tdata->scratch_mem_size++;
}
while (tdata->scratch_mem_size > w) {
tdata->scratch_mem_size--;
checkCudaErrors(cudaFree(tdata->scratch_mem_host[tdata->scratch_mem_size]));
}
cudaMemcpy(tdata->scratch_mem_dev, tdata->scratch_mem_host, sizeof(uint32_t *) * w, cudaMemcpyHostToDevice);
return (0);
}
static int max_initialize_buffers(struct cuda_dev *dev, struct cuda_thread_data *tdata) {
if (max_initialize_scratch(dev, tdata)) {
applog(LOG_ERR, "GPU #%d: Unable to initialize device scratch for %dx%d", dev->sgminer_id, dev->grid_blocks, dev->warps_per_block);
return(-1);
}
// buffers_initialized holds the wu_per_launch the buffers were sized for;
// if that is already at least as large as we need, there is nothing to do
if (tdata->buffers_initialized >= dev->wu_per_launch) return (0);
if (tdata->buffers_initialized > 0) {
// Free, then create
max_free_buffers(dev, tdata);
} else {
// First run: allocate things that don't change with the work size
// allocate pinned host memory for scrypt hashes
checkCudaErrors(cudaHostAlloc(&tdata->context_Hnonce, sizeof(uint32_t) * MAXBUFFERS, cudaHostAllocDefault));
checkCudaErrors(cudaMalloc(&tdata->context_nonce, sizeof(uint32_t) * MAXBUFFERS));
// create a CUDA stream
checkCudaErrors(cudaStreamCreate(&tdata->context_streams[0]));
// events used to serialize the kernel launches (we don't want any overlapping of kernels)
checkCudaErrors(cudaEventCreateWithFlags(&tdata->context_serialize[0], cudaEventDisableTiming | cudaEventBlockingSync));
checkCudaErrors(cudaEventRecord(tdata->context_serialize[0]));
}
unsigned int mem_size = dev->wu_per_launch * sizeof(uint32_t) * 32;
unsigned int state_size = dev->wu_per_launch * sizeof(uint32_t) * 8;
if (cudaMalloc(&tdata->context_idata[0], mem_size)) goto errout;
if (cudaMalloc(&tdata->context_odata[0], mem_size)) goto errout;
if (cudaMalloc(&tdata->context_tstate[0], state_size)) goto errout;
if (cudaMalloc(&tdata->context_ostate[0], state_size)) goto errout;
tdata->buffers_initialized = dev->wu_per_launch;
return (0);
errout:
applog(LOG_ERR, "GPU #%d: Unable to allocate device memory for %dx%d", dev->sgminer_id, dev->grid_blocks, dev->warps_per_block);
return(-1);
}
// Abort on failure
#define AOF(x) if (x) { success = 0; goto abort; }
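// Scan the nonce range [min_nonce, max_nonce) with the current launch
// configuration. The host computes the SHA-256 midstate once, then for each
// batch of wu_per_launch nonces queues three kernel stages on one stream: a
// SHA-256 preprocessing step (max_pre_sha256), the scrypt core, and a closing
// SHA-256 plus target-check step (max_post_sha256). Matching nonces are
// written to context_nonce on the device and copied back once the stream
// drains. Returns the number of hashes attempted, or 0 on failure.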
static uint64_t max_work(struct cuda_dev *dev, int dev_id, int thr_id, uint32_t *data, const uint32_t *ptarget,
uint32_t min_nonce, uint32_t max_nonce, uint32_t *found_nonce) {
int throughput = dev->wu_per_launch;
const uint32_t Htarg = ptarget[7];
int N = MAGIC;
uint32_t nonce;
uint32_t *nres;
uint32_t midstate[8];
int success = 1;
int cur = 0;
// Clear error buff
cudaGetLastError();
if (max_initialize_buffers(dev, &dev->tdata[thr_id])) return (0);
nres = dev->tdata[thr_id].context_Hnonce;
nres[FOUND] = 0;
cur = 0;
cudaMemcpyAsync(&dev->tdata[thr_id].context_nonce[FOUND], &nres[FOUND], sizeof(uint32_t), cudaMemcpyHostToDevice, dev->tdata[thr_id].context_streams[cur]);
max_sha256_init(&dev->tdata[thr_id], midstate);
max_sha256_transform(midstate, data, 0);
max_prepare_sha256(data, midstate);
cur = 0;
nonce = min_nonce;
do {
max_pre_sha256(&dev->tdata[thr_id], cur, nonce, 128, (throughput + 127) / 128);
if (cuda_scrypt_core(dev, &dev->tdata[thr_id], cur, N)) {
applog(LOG_ERR, "GPU #%d: Unable to run scrypt core with settings %dx%d", dev->sgminer_id, dev->grid_blocks, dev->warps_per_block);
cudaError_t err = cudaGetLastError();
applog(LOG_ERR, "GPU #%d: cudaError %d (%s) ", dev->sgminer_id, err, cudaGetErrorString(err));
success = 0;
goto abort;
}
max_post_sha256(&dev->tdata[thr_id], cur, 128, (throughput + 127) / 128, Htarg, nonce);
nonce += throughput;
} while (nonce < max_nonce);
abort:
double sync_start = doubletime();
#ifndef _WIN32
/* Windows doesn't have the timer resolution to make this worthwhile. Also, the
behaviour of synchronous calls differs from Linux: on Linux they appear to spin
the CPU, so the sleep matters to avoid 100% CPU usage while waiting. */
if (dev->autotune_complete && dev->tdata[thr_id].sync_time > 0.001) {
usleep(dev->tdata[thr_id].sync_time * 1000000.0 * 0.95);
}
#endif
while (cudaStreamQuery(dev->tdata[thr_id].context_streams[0]) == cudaErrorNotReady) usleep(100);
dev->tdata[thr_id].sync_time = doubletime() - sync_start;
cudaMemcpy(&found_nonce[FOUND], &dev->tdata[thr_id].context_nonce[FOUND], sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (found_nonce[FOUND])
cudaMemcpy(found_nonce, dev->tdata[thr_id].context_nonce, sizeof(uint32_t) * min(MAXBUFFERS, found_nonce[FOUND]), cudaMemcpyDeviceToHost);
if (success && (cudaGetLastError() == cudaSuccess)) return (nonce - min_nonce);
else return(0);
}
static __device__ uint4& operator^=(uint4 &left, const uint4 &right) {
left.x ^= right.x;
left.y ^= right.y;
left.z ^= right.z;
left.w ^= right.w;
return (left);
}
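// Warp shuffle of a whole uint4, done component-wise; used by the transposed
// scratchpad read/write helpers below to move data between the eight lanes of
// a tile.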
__device__ static __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width) {
return (make_uint4(
(unsigned int)__shfl((int)val.x, lane, width),
(unsigned int)__shfl((int)val.y, lane, width),
(unsigned int)__shfl((int)val.z, lane, width),
(unsigned int)__shfl((int)val.w, lane, width)));
}
#define SPACE 2
#define lane8 (__laneId() % 8)
#define tile (__laneId() / 8)
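// Each 32-lane warp is split into four 8-lane tiles: lane8 is a lane's index
// within its tile, and tile selects which of the four 8x8 uint4 sub-matrices
// the lane works on.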
__device__ static __forceinline__ void __transposed_write_BC1(uint4(&B)[4], uint4(&C)[4], uint4 *D, int row) {
uint4 T1[8];
uint4 T2[8];
/* Source matrix, A-H are threads, 0-7 are data items, thread A is marked with `*`:
*A0 B0 C0 D0 E0 F0 G0 H0
*A1 B1 C1 D1 E1 F1 G1 H1
*A2 B2 C2 D2 E2 F2 G2 H2
*A3 B3 C3 D3 E3 F3 G3 H3
*A4 B4 C4 D4 E4 F4 G4 H4
*A5 B5 C5 D5 E5 F5 G5 H5
*A6 B6 C6 D6 E6 F6 G6 H6
*A7 B7 C7 D7 E7 F7 G7 H7
*/
// rotate rows
T1[0] = B[0];
T1[1] = __shfl(B[1], lane8 + 7, 8);
T1[2] = __shfl(B[2], lane8 + 6, 8);
T1[3] = __shfl(B[3], lane8 + 5, 8);
T1[4] = __shfl(C[0], lane8 + 4, 8);
T1[5] = __shfl(C[1], lane8 + 3, 8);
T1[6] = __shfl(C[2], lane8 + 2, 8);
T1[7] = __shfl(C[3], lane8 + 1, 8);
/* Matrix after row rotates:
*A0 B0 C0 D0 E0 F0 G0 H0
H1 *A1 B1 C1 D1 E1 F1 G1
G2 H2 *A2 B2 C2 D2 E2 F2
F3 G3 H3 *A3 B3 C3 D3 E3
E4 F4 G4 H4 *A4 B4 C4 D4
D5 E5 F5 G5 H5 *A5 B5 C5
C6 D6 E6 F6 G6 H6 *A6 B6
B7 C7 D7 E7 F7 G7 H7 *A7
*/
// rotate columns up using a barrel shifter simulation
// column X is rotated up by (X+1) items
#pragma unroll 8
for (int n = 0; n < 8; n++) T2[n] = ((lane8 + 1) & 1) ? T1[(n + 1) % 8] : T1[n];
#pragma unroll 8
for (int n = 0; n < 8; n++) T1[n] = ((lane8 + 1) & 2) ? T2[(n + 2) % 8] : T2[n];
#pragma unroll 8
for (int n = 0; n < 8; n++) T2[n] = ((lane8 + 1) & 4) ? T1[(n + 4) % 8] : T1[n];
/* Matrix after column rotates:
H1 H2 H3 H4 H5 H6 H7 H0
G2 G3 G4 G5 G6 G7 G0 G1
F3 F4 F5 F6 F7 F0 F1 F2
E4 E5 E6 E7 E0 E1 E2 E3
D5 D6 D7 D0 D1 D2 D3 D4
C6 C7 C0 C1 C2 C3 C4 C5
B7 B0 B1 B2 B3 B4 B5 B6
*A0 *A1 *A2 *A3 *A4 *A5 *A6 *A7
*/
// rotate rows again using address math and write to D, in reverse row order
D[row * SPACE * (32 * tile) + lane8] = T2[7];
D[row * SPACE * (32 * tile + 4) + (lane8 + 7) % 8] = T2[6];
D[row * SPACE * (32 * tile + 8) + (lane8 + 6) % 8] = T2[5];
D[row * SPACE * (32 * tile + 12) + (lane8 + 5) % 8] = T2[4];
D[row * SPACE * (32 * tile + 16) + (lane8 + 4) % 8] = T2[3];
D[row * SPACE * (32 * tile + 20) + (lane8 + 3) % 8] = T2[2];
D[row * SPACE * (32 * tile + 24) + (lane8 + 2) % 8] = T2[1];
D[row * SPACE * (32 * tile + 28) + (lane8 + 1) % 8] = T2[0];
}
__device__ static __forceinline__ void __transposed_read_BC(
const uint4 *S,
uint4(&B)[4],
uint4(&C)[4],
int spacing, int row, int do_xor) {
// Perform the same transposition as __transposed_write_BC1, but in reverse.
// See the matrix illustrations in the comments of __transposed_write_BC1.
// read and rotate rows, in reverse row order
uint4 T1[8], T2[8];
T1[7] = __ldg(&S[(spacing * SPACE * (32 * tile) + lane8 + 8 * __shfl(row, 0, 8))]);
T1[6] = __ldg(&S[(spacing * SPACE * (32 * tile + 4) + (lane8 + 7) % 8 + 8 * __shfl(row, 1, 8))]);
T1[5] = __ldg(&S[(spacing * SPACE * (32 * tile + 8) + (lane8 + 6) % 8 + 8 * __shfl(row, 2, 8))]);
T1[4] = __ldg(&S[(spacing * SPACE * (32 * tile + 12) + (lane8 + 5) % 8 + 8 * __shfl(row, 3, 8))]);
T1[3] = __ldg(&S[(spacing * SPACE * (32 * tile + 16) + (lane8 + 4) % 8 + 8 * __shfl(row, 4, 8))]);
T1[2] = __ldg(&S[(spacing * SPACE * (32 * tile + 20) + (lane8 + 3) % 8 + 8 * __shfl(row, 5, 8))]);
T1[1] = __ldg(&S[(spacing * SPACE * (32 * tile + 24) + (lane8 + 2) % 8 + 8 * __shfl(row, 6, 8))]);
T1[0] = __ldg(&S[(spacing * SPACE * (32 * tile + 28) + (lane8 + 1) % 8 + 8 * __shfl(row, 7, 8))]);
// rotate columns down using a barrel shifter simulation
// column X is rotated down by (X+1) items, or up by (8-(X+1)) = (7-X) items
if ((7 - lane8) & 1) {
#pragma unroll 7
for (int n = 0; n < 7; n++) T2[n] = T1[(n + 1)];
T2[7] = T1[0];
} else {
#pragma unroll 8
for (int n = 0; n < 8; n++) T2[n] = T1[n];
}
if ((7 - lane8) & 2) {
#pragma unroll 6
for (int n = 0; n < 6; n++) T1[n] = T2[(n + 2)];
T1[6] = T2[0];
T1[7] = T2[1];
} else {
#pragma unroll 8
for (int n = 0; n < 8; n++) T1[n] = T2[n];
}
#pragma unroll 4
for (int n = 0; n < 4; n++) T2[n] = ((7 - lane8) & 4) ? T1[(n + 4)] : T1[n];
#pragma unroll 4
for (int n = 4; n < 8; n++) T2[n] = ((7 - lane8) & 4) ? T1[(n - 4)] : T1[n];
// rotate rows
if (do_xor) {
B[0] ^= T2[0];
B[1] ^= __shfl(T2[1], lane8 + 1, 8);
B[2] ^= __shfl(T2[2], lane8 + 2, 8);
B[3] ^= __shfl(T2[3], lane8 + 3, 8);
C[0] ^= __shfl(T2[4], lane8 + 4, 8);
C[1] ^= __shfl(T2[5], lane8 + 5, 8);
C[2] ^= __shfl(T2[6], lane8 + 6, 8);
C[3] ^= __shfl(T2[7], lane8 + 7, 8);
} else {
B[0] = T2[0];
B[1] = __shfl(T2[1], lane8 + 1, 8);
B[2] = __shfl(T2[2], lane8 + 2, 8);
B[3] = __shfl(T2[3], lane8 + 3, 8);
C[0] = __shfl(T2[4], lane8 + 4, 8);
C[1] = __shfl(T2[5], lane8 + 5, 8);
C[2] = __shfl(T2[6], lane8 + 6, 8);
C[3] = __shfl(T2[7], lane8 + 7, 8);
}
}
// 32-bit left rotate via the funnel-shift instruction
#define ROTL(a, b) __funnelshift_l(a, a, b)
#define ROTL7(a0,a1,a2,a3,a00,a10,a20,a30) {\
a0 ^= ROTL(a00, 7); a1 ^= ROTL(a10, 7); a2 ^= ROTL(a20, 7); a3 ^= ROTL(a30, 7);\
}
#define ROTL9(a0,a1,a2,a3,a00,a10,a20,a30) {\
a0 ^= ROTL(a00, 9); a1 ^= ROTL(a10, 9); a2 ^= ROTL(a20, 9); a3 ^= ROTL(a30, 9);\
}
#define ROTL13(a0,a1,a2,a3,a00,a10,a20,a30) {\
a0 ^= ROTL(a00, 13); a1 ^= ROTL(a10, 13); a2 ^= ROTL(a20, 13); a3 ^= ROTL(a30, 13);\
}
#define ROTL18(a0,a1,a2,a3,a00,a10,a20,a30) {\
a0 ^= ROTL(a00, 18); a1 ^= ROTL(a10, 18); a2 ^= ROTL(a20, 18); a3 ^= ROTL(a30, 18);\
}
__device__ static __forceinline__ void xor_salsa8(uint4 *B, uint4 *C) {
uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
uint32_t x8, x9, x10, x11, x12, x13, x14, x15;
x0 = (B[0].x ^= C[0].x);
x1 = (B[0].y ^= C[0].y);
x2 = (B[0].z ^= C[0].z);
x3 = (B[0].w ^= C[0].w);
x4 = (B[1].x ^= C[1].x);
x5 = (B[1].y ^= C[1].y);
x6 = (B[1].z ^= C[1].z);
x7 = (B[1].w ^= C[1].w);
x8 = (B[2].x ^= C[2].x);
x9 = (B[2].y ^= C[2].y);
x10 = (B[2].z ^= C[2].z);
x11 = (B[2].w ^= C[2].w);
x12 = (B[3].x ^= C[3].x);
x13 = (B[3].y ^= C[3].y);
x14 = (B[3].z ^= C[3].z);
x15 = (B[3].w ^= C[3].w);
#pragma unroll 4
for (int j = 0; j < 4; j++) {
/* Operate on columns. */
ROTL7(x4, x9, x14, x3, x0 + x12, x1 + x5, x6 + x10, x11 + x15);
ROTL9(x8, x13, x2, x7, x0 + x4, x5 + x9, x10 + x14, x3 + x15);
ROTL13(x12, x1, x6, x11, x4 + x8, x9 + x13, x2 + x14, x3 + x7);
ROTL18(x0, x5, x10, x15, x8 + x12, x1 + x13, x2 + x6, x7 + x11);
/* Operate on rows. */
ROTL7(x1, x6, x11, x12, x0 + x3, x4 + x5, x9 + x10, x14 + x15);
ROTL9(x2, x7, x8, x13, x0 + x1, x5 + x6, x10 + x11, x12 + x15);
ROTL13(x3, x4, x9, x14, x1 + x2, x6 + x7, x8 + x11, x12 + x13);
ROTL18(x0, x5, x10, x15, x2 + x3, x4 + x7, x8 + x9, x13 + x14);
}
B[0].x += x0; B[0].y += x1; B[0].z += x2; B[0].w += x3;
B[1].x += x4; B[1].y += x5; B[1].z += x6; B[1].w += x7;
B[2].x += x8; B[2].y += x9; B[2].z += x10; B[2].w += x11;
B[3].x += x12; B[3].y += x13; B[3].z += x14; B[3].w += x15;
x0 = (C[0].x ^= B[0].x);
x1 = (C[0].y ^= B[0].y);
x2 = (C[0].z ^= B[0].z);
x3 = (C[0].w ^= B[0].w);
x4 = (C[1].x ^= B[1].x);
x5 = (C[1].y ^= B[1].y);
x6 = (C[1].z ^= B[1].z);
x7 = (C[1].w ^= B[1].w);
x8 = (C[2].x ^= B[2].x);
x9 = (C[2].y ^= B[2].y);
x10 = (C[2].z ^= B[2].z);
x11 = (C[2].w ^= B[2].w);
x12 = (C[3].x ^= B[3].x);
x13 = (C[3].y ^= B[3].y);
x14 = (C[3].z ^= B[3].z);
x15 = (C[3].w ^= B[3].w);
for (int j = 0; j < 4; j++) {
/* Operate on columns. */
ROTL7(x4, x9, x14, x3, x0 + x12, x1 + x5, x6 + x10, x11 + x15);
ROTL9(x8, x13, x2, x7, x0 + x4, x5 + x9, x10 + x14, x3 + x15);
ROTL13(x12, x1, x6, x11, x4 + x8, x9 + x13, x2 + x14, x3 + x7);
ROTL18(x0, x5, x10, x15, x8 + x12, x1 + x13, x2 + x6, x7 + x11);
/* Operate on rows. */
ROTL7(x1, x6, x11, x12, x0 + x3, x4 + x5, x9 + x10, x14 + x15);
ROTL9(x2, x7, x8, x13, x0 + x1, x5 + x6, x10 + x11, x12 + x15);
ROTL13(x3, x4, x9, x14, x1 + x2, x6 + x7, x8 + x11, x12 + x13);
ROTL18(x0, x5, x10, x15, x2 + x3, x4 + x7, x8 + x9, x13 + x14);
}
C[0].x += x0; C[0].y += x1; C[0].z += x2; C[0].w += x3;
C[1].x += x4; C[1].y += x5; C[1].z += x6; C[1].w += x7;
C[2].x += x8; C[2].y += x9; C[2].z += x10; C[2].w += x11;
C[3].x += x12; C[3].y += x13; C[3].z += x14; C[3].w += x15;
}
__global__ static void nv2_scrypt_core_kernel(uint32_t *g_idata, uint32_t *g_odata, uint32_t **scratch, int batch, int end);
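// Launch the scrypt core kernel on the given stream: grid_blocks blocks of
// 32 * warps_per_block threads, one scratchpad (from scratch_mem_dev) per warp.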
static int cuda_scrypt_core(struct cuda_dev *dev, struct cuda_thread_data *tdata, int cur, int N) {
dim3 grid;
dim3 threads;
grid.x = dev->grid_blocks;
threads.x = 32 * dev->warps_per_block;
nv2_scrypt_core_kernel<<< grid, threads, 0, tdata->context_streams[cur]>>>
(tdata->context_idata[cur], tdata->context_odata[cur], tdata->scratch_mem_dev, MAGIC, N);
// report launch failures to the caller without consuming the sticky error
return (cudaPeekAtLastError() != cudaSuccess);
}
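// scrypt ROMix core, one scratchpad per warp: the first loop sequentially
// writes N = 1024 transposed block states into V, the second loop performs N
// data-dependent reads (slot = C[0].x & (N - 1)), XOR-ing each into the
// running state and re-mixing with xor_salsa8, before the final transposed
// write of the result to g_odata.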
__global__ static void nv2_scrypt_core_kernel(uint32_t *g_idata, uint32_t *g_odata, uint32_t **scratch, int batch, int end) {
int offset = blockIdx.x * blockDim.x + threadIdx.x / warpSize * warpSize;
g_idata += 32 * offset;
g_odata += 32 * offset;
uint32_t *V = scratch[offset / warpSize];
uint4 B[4], C[4];
int begin;
__transposed_read_BC((uint4 *)g_idata, B, C, 1, 0, 0);
__transposed_write_BC1(B, C, (uint4 *)V, 1024);
#pragma unroll 1024
for (begin = 1; begin < end; begin++) {
xor_salsa8(B, C);
__transposed_write_BC1(B, C, (uint4 *)(V + begin * 32), 1024);
}
__transposed_read_BC((uint4 *)V, B, C, c_N, c_N_1, 0);
xor_salsa8(B, C);
#pragma unroll 1024
for (begin = 0; begin < end; begin++) {
int slot = C[0].x & c_N_1;
__transposed_read_BC((uint4 *)(V), B, C, c_N, slot, 1);
xor_salsa8(B, C);
}
__transposed_write_BC1(B, C, (uint4 *)(g_odata), 1);
}