Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

In Tricore's implementation, the binary search has some problem? #2

Open
Arsmart1 opened this issue Apr 13, 2023 · 0 comments
Open

Comments

@Arsmart1
Copy link

Here, when we store the edge_list into "local" shared memory list, we use: local[p * 32 + i] = a[i * degree_m / 32];
Notice here we are not storing the first several layers of the binary search tree, but it is simply arithmetic progression, just like, if we have degree_m=4, we will store in: [0, 0, 0, 0, 0, 0, 0, 0(eight fist value), 1, 1, 1, 1, 1, 1, 1, 1(eight second value), ...] and it does not correspond to the later search index, like, Y = local[p * 32 + r];, what we want here is the r-th value in edge_list, but what we really have is not, is just from arithmetic progression.

__global__ void warp_binary_kernel(const uint32_t* __restrict__ edge_m, const uint32_t* __restrict__ node_index_m, uint32_t edge_m_count, uint32_t* __restrict__ adj_m, uint32_t start_node_n, const uint32_t* __restrict__ node_index_n, uint32_t node_index_n_count, uint32_t* __restrict__ adj_n, uint64_t *results) {
    //phase 1, partition
    uint64_t count = 0;
    __shared__ uint32_t local[BLOCKSIZE];

    uint32_t i = threadIdx.x % 32;
    uint32_t p = threadIdx.x / 32;
    for (uint32_t tid = (threadIdx.x + blockIdx.x * blockDim.x) / 32; tid < edge_m_count; tid += blockDim.x * gridDim.x / 32) {
        uint32_t node_m = edge_m[tid];
        uint32_t node_n = adj_m[tid];
        if (node_n < start_node_n || node_n >= start_node_n + node_index_n_count) {
            continue;
        }

        uint32_t degree_m = node_index_m[node_m + 1] - node_index_m[node_m];
        uint32_t degree_n = node_index_n[node_n + 1 - start_node_n] - node_index_n[node_n - start_node_n];
        uint32_t* a = adj_m + node_index_m[node_m];
        uint32_t* b = adj_n + node_index_n[node_n - start_node_n];
        if(degree_m < degree_n){
            uint32_t temp = degree_m;
            degree_m = degree_n;
            degree_n = temp;
            uint32_t *aa = a;
            a = b;
            b = aa;
        }

        //initial cache
        local[p * 32 + i] = a[i * degree_m / 32];
        __syncthreads();

        //search
        uint32_t j = i;
        while(j < degree_n){
            uint32_t X = b[j];
            uint32_t Y;
            //phase 1: cache
            int32_t bot = 0;
            int32_t top = 32;
            int32_t r;
            while(top > bot + 1){
                r = (top + bot) / 2;
                Y = local[p * 32 + r];
                if(X == Y){
                    count++;
                    bot = top + 32;
                }
                if(X < Y){
                    top = r;
                }
                if(X > Y){
                    bot = r;
                }
            }
            //phase 2
            bot = bot * degree_m / 32;
            top = top * degree_m / 32 - 1;
            while(top >= bot){
                r = (top + bot) / 2;
                Y = a[r];
                if(X == Y){
                    count++;
                }
                if(X <= Y){
                    top = r - 1;
                }
                if(X >= Y){
                    bot = r + 1;
                }
            }
            j += 32;
        }
        __syncthreads();
    }
    results[blockDim.x * blockIdx.x + threadIdx.x] = count;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant