In Tricore's implementation, the binary search has some problem? #2

Arsmart1 · 2023-04-13T12:18:16Z

Here, when we store the edge_list into "local" shared memory list, we use: local[p * 32 + i] = a[i * degree_m / 32];
Notice here we are not storing the first several layers of the binary search tree, but it is simply arithmetic progression, just like, if we have degree_m=4, we will store in: [0, 0, 0, 0, 0, 0, 0, 0(eight fist value), 1, 1, 1, 1, 1, 1, 1, 1(eight second value), ...] and it does not correspond to the later search index, like, Y = local[p * 32 + r];, what we want here is the r-th value in edge_list, but what we really have is not, is just from arithmetic progression.

__global__ void warp_binary_kernel(const uint32_t* __restrict__ edge_m, const uint32_t* __restrict__ node_index_m, uint32_t edge_m_count, uint32_t* __restrict__ adj_m, uint32_t start_node_n, const uint32_t* __restrict__ node_index_n, uint32_t node_index_n_count, uint32_t* __restrict__ adj_n, uint64_t *results) {
    //phase 1, partition
    uint64_t count = 0;
    __shared__ uint32_t local[BLOCKSIZE];

    uint32_t i = threadIdx.x % 32;
    uint32_t p = threadIdx.x / 32;
    for (uint32_t tid = (threadIdx.x + blockIdx.x * blockDim.x) / 32; tid < edge_m_count; tid += blockDim.x * gridDim.x / 32) {
        uint32_t node_m = edge_m[tid];
        uint32_t node_n = adj_m[tid];
        if (node_n < start_node_n || node_n >= start_node_n + node_index_n_count) {
            continue;
        }

        uint32_t degree_m = node_index_m[node_m + 1] - node_index_m[node_m];
        uint32_t degree_n = node_index_n[node_n + 1 - start_node_n] - node_index_n[node_n - start_node_n];
        uint32_t* a = adj_m + node_index_m[node_m];
        uint32_t* b = adj_n + node_index_n[node_n - start_node_n];
        if(degree_m < degree_n){
            uint32_t temp = degree_m;
            degree_m = degree_n;
            degree_n = temp;
            uint32_t *aa = a;
            a = b;
            b = aa;
        }

        //initial cache
        local[p * 32 + i] = a[i * degree_m / 32];
        __syncthreads();

        //search
        uint32_t j = i;
        while(j < degree_n){
            uint32_t X = b[j];
            uint32_t Y;
            //phase 1: cache
            int32_t bot = 0;
            int32_t top = 32;
            int32_t r;
            while(top > bot + 1){
                r = (top + bot) / 2;
                Y = local[p * 32 + r];
                if(X == Y){
                    count++;
                    bot = top + 32;
                }
                if(X < Y){
                    top = r;
                }
                if(X > Y){
                    bot = r;
                }
            }
            //phase 2
            bot = bot * degree_m / 32;
            top = top * degree_m / 32 - 1;
            while(top >= bot){
                r = (top + bot) / 2;
                Y = a[r];
                if(X == Y){
                    count++;
                }
                if(X <= Y){
                    top = r - 1;
                }
                if(X >= Y){
                    bot = r + 1;
                }
            }
            j += 32;
        }
        __syncthreads();
    }
    results[blockDim.x * blockIdx.x + threadIdx.x] = count;
}

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

In Tricore's implementation, the binary search has some problem? #2

In Tricore's implementation, the binary search has some problem? #2

Arsmart1 commented Apr 13, 2023

In Tricore's implementation, the binary search has some problem? #2

In Tricore's implementation, the binary search has some problem? #2

Comments

Arsmart1 commented Apr 13, 2023