Shortest paths by BFS, porting a code from CUDA to openCL

Problem description

I am currently porting a CUDA code that finds the shortest paths from each node to the other nodes in an (undirected) graph. So basically, the CUDA code constructs a graph read from a text file. Then it proceeds to build the adjacency arrays h_v and h_e.

For example, the edge list

A B
A C
B C

gives

h_v[0] = 0, h_e[0] = 1
h_v[1] = 0, h_e[1] = 2
h_v[2] = 1, h_e[2] = 2
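
(A minimal sketch of how these arrays could be filled, assuming the node labels have already been mapped to the integer indices 0..n_count-1; edge_src and edge_dst are hypothetical names for the parsed edge list:)

/* hypothetical sketch: edge i connects edge_src[i] and edge_dst[i],
   with labels already remapped to 0..n_count-1 */
int *h_v = (int *)malloc(sizeof(int)*e_count);
int *h_e = (int *)malloc(sizeof(int)*e_count);
for(int i=0; i<e_count; i++){
  h_v[i] = edge_src[i]; /* endpoint u of edge i */
  h_e[i] = edge_dst[i]; /* endpoint w of edge i */
}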

Then it calls the kernel to compute the shortest paths from each node using BFS.

The CUDA host code is as follows:

int cc_bfs(int n_count, int e_count, int *h_v, int *h_e, float *h_cc, bool ec){
  int *d_v, *d_e;


  cudaCheckError(cudaMalloc((void **)&d_v, sizeof(int)*e_count));
  cudaCheckError(cudaMalloc((void **)&d_e, sizeof(int)*e_count)); 

  cudaCheckError(cudaMemcpy(d_v, h_v, sizeof(int)*e_count, cudaMemcpyHostToDevice));
  cudaCheckError(cudaMemcpy(d_e, h_e, sizeof(int)*e_count, cudaMemcpyHostToDevice));

  int *d_d, *d_dist; 

  cudaCheckError(cudaMalloc((void **)&d_d, sizeof(int)*n_count));
  cudaCheckError(cudaMalloc((void **)&d_dist, sizeof(int)));

  int *h_d;
  h_d=(int *)malloc(sizeof(int)*n_count);
  bool *d_continue;
  cudaCheckError(cudaMalloc((void**)&d_continue, sizeof(bool)));

  for(int s=0; s<n_count; s++){ //BIG FOR LOOP

    //////code to initalize h_d[i]
    for(int i=0; i<n_count; i++)
      h_d[i]=-1;
    h_d[s]=0; //for marking the root
    cudaCheckError(cudaMemcpy(d_d, h_d, sizeof(int)*n_count, cudaMemcpyHostToDevice));
    //////////////////////////////

    ///////////////////////////////
    int threads_per_block=e_count;
    int blocks=1;
    if(e_count>MAX_THREADS_PER_BLOCK){
      blocks = (int)ceil(e_count/(float)MAX_THREADS_PER_BLOCK); 
      threads_per_block = MAX_THREADS_PER_BLOCK; 
    }
    dim3 grid(blocks);
    dim3 threads(threads_per_block);
    /////////////////////////////////

    bool h_continue;
    int h_dist=0;
    cudaCheckError(cudaMemset(d_dist, 0, sizeof(int)));
    do{
      h_continue=false;
      cudaCheckError(cudaMemcpy(d_continue, &h_continue, sizeof(bool), cudaMemcpyHostToDevice));
      cc_bfs_kernel<<<grid, threads>>>(d_v, d_e, d_d, d_continue, d_dist, e_count);
      checkCUDAError("Kernel invocation");
      cudaThreadSynchronize();
      h_dist++;
      cudaCheckError(cudaMemcpy(d_dist, &h_dist, sizeof(int), cudaMemcpyHostToDevice));//for what?
      cudaCheckError(cudaMemcpy(&h_continue, d_continue, sizeof(bool), cudaMemcpyDeviceToHost));
    }while(h_continue);

    ///////////////////
    //then code to read back h_d from device

  } //end of the BIG FOR LOOP
}

Here is the CUDA kernel:

__global__ void cc_bfs_kernel(int *d_v, int *d_e, int  *d_d,
    bool *d_continue, int *d_dist, int e_count){
  int tid = blockIdx.x*blockDim.x + threadIdx.x;
  if(tid<e_count){
    /* for each edge (u, w) */
    int u=d_v[tid];
    int w=d_e[tid];
    if(d_d[u]==*d_dist){ //u is on the current BFS frontier
      if(d_d[w]==-1){ //w has not been visited yet
        *d_continue=true; //a new node was discovered, so run another level
        d_d[w]=*d_dist+1; //w is one level farther from the root
      }
    }
  }
}

Here is my effort to port it to openCL. I am just an amateur in openCL, so I am trying my best to port the original code line by line :(

The openCL host code:

cl_mem d_d= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*n_count, NULL,NULL);
cl_mem d_dist= clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(int), NULL,NULL);
int *h_d;
h_d=(int *)malloc(sizeof(int)*n_count);
cl_mem d_continue = clCreateBuffer(context,CL_MEM_READ_WRITE| CL_MEM_USE_HOST_PTR,sizeof(bool), NULL,NULL);
float* h_cc;
h_cc = (float *)malloc(sizeof(float)*n_count);

cl_mem d_v= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);
cl_mem d_e= clCreateBuffer(context,CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,sizeof(int)*e_count, NULL,NULL);

err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, e_count * sizeof(int), host_v, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, d_e, CL_TRUE, 0, e_count * sizeof(int), host_e, 0, NULL, NULL);

size_t global_size= e_count;

for(int s=0; s<n_count; s++)
{ //BIG LOOP

    //initalize h_d[i]
    for(int i=0; i<n_count; i++)
        h_d[i]=-1;
    h_d[s]=0;

    //copy h_d to d_d
     err = clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
        n_count * sizeof(int), h_d, 0, NULL, NULL);

    bool h_continue;
    int h_dist=0;
    int mark = 0;
    int* h_id;
    h_id= (int*) malloc(sizeof(int)*e_count);
    cl_mem id= clCreateBuffer(context,CL_MEM_WRITE_ONLY| CL_MEM_USE_HOST_PTR,
            sizeof(int)*e_count, NULL,NULL);


    do{
        h_continue=false;
        err = clEnqueueWriteBuffer(queue, d_continue, CL_TRUE, 0,
          sizeof(bool), &h_continue, 0, NULL, NULL);


        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v);
        err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_e);
        err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_d);
        err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_continue);
        err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_dist);
        err = clSetKernelArg(kernel, 5, sizeof(int), (void *)&e_count);
        err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&id);

        /////EXECUTE
        cl_event sync1;
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, 
            &global_size, NULL, 0, NULL, &sync1); //wait for this to finish (to synchronize)                
        err = clWaitForEvents(1, &sync1);
        clReleaseEvent(sync1);          
        ///////////////////


        err = clEnqueueReadBuffer(queue, id, CL_TRUE, 0,
            sizeof(int)*e_count, h_id, 0, NULL, NULL);

        printf("e_count = %d error : %d\n",e_count, err);//check error?

        for(int j = 0; j< e_count; j++)
        {
            printf("%d ",h_id[j]);
        }

        h_dist++;
        mark++;//for debug
        err = clEnqueueWriteBuffer(queue, d_dist, CL_TRUE, 0,
            sizeof(int), &h_dist, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, d_continue, CL_TRUE, 0,
            sizeof(bool), &h_continue, 0, NULL, NULL);
    }
    while(h_continue);

    err = clEnqueueReadBuffer(queue, d_d, CL_TRUE, 0,
        n_count*sizeof(int), h_d, 0, NULL, NULL);
} //end of the BIG LOOP

And the openCL kernel:

__kernel void cc_bfs_kernel(__global int *d_v, __global int *d_e, __global int  *d_d,
    __global bool *d_continue, __global int *d_dist, const int e_count, __global int *id)
{
        int tid = get_global_id(0)-get_global_offset(0);
        //barrier(CLK_GLOBAL_MEM_FENCE);

        //debug: every work item fills the entire id array
        for (int i = 0; i< e_count; i++)
        {
            id[i]=i;
        }

        if(tid<e_count){
            id[tid]= tid;
            /* for each edge (u, w) */
            int u=d_v[tid];
            int w=d_e[tid];
            if(d_d[u]==*d_dist){ //of the interest root
                if(d_d[w]==-1)
                { //not yet check 
                    *d_continue=true; //continue
                    d_d[w]=*d_dist+1; //increase
                }   
            }   
        }
}

The code can't give the correct result, so I debugged it by printing some values (the tid inside the kernel, and the mark value to check how many times the code goes through the while loop). Sadly, the tid gives rubbish values, and it goes through the while loop only once. Could you please point out what I am missing here?

I have another doubt: how can I do something similar to cudaThreadSynchronize()? In this version of the openCL code, I associate clEnqueueNDRangeKernel with a command event and wait for it, but apparently it does not seem to work :(

Thank you very much.

Answer

First, you should ensure that every step is correct by checking the error codes.
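
For example, every err in the host code above could be funneled through a small checking helper; this is only a sketch (the CL_CHECK name is made up here), playing the same role as the cudaCheckError wrapper in the CUDA version:

#include <stdio.h>
#include <stdlib.h>

#define CL_CHECK(call)                                        \
  do {                                                        \
    cl_int _status = (call);                                  \
    if (_status != CL_SUCCESS) {                              \
      fprintf(stderr, "OpenCL error %d at %s:%d\n",           \
              (int)_status, __FILE__, __LINE__);              \
      exit(EXIT_FAILURE);                                     \
    }                                                         \
  } while (0)

/* usage: */
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_v));
CL_CHECK(clEnqueueWriteBuffer(queue, d_d, CL_TRUE, 0,
    n_count * sizeof(int), h_d, 0, NULL, NULL));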

As an example, AFAIK, this code is not valid:

clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*e_count, NULL,NULL)

because you are asking to use an allocated memory area whereas you're not providing any: the host_ptr parameter is indeed NULL.

Either remove this flag, or if you really want host-allocated memory, specify CL_MEM_ALLOC_HOST_PTR instead.
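
A minimal corrected sketch of one of the buffer creations, using the last parameter to retrieve the status:

cl_int status;
cl_mem d_v = clCreateBuffer(context, CL_MEM_READ_ONLY,
    sizeof(int)*e_count, NULL, &status);
/* status is CL_SUCCESS on success; with CL_MEM_USE_HOST_PTR and a
   NULL host_ptr it would be CL_INVALID_HOST_PTR instead */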

Check the API documentation to know, for each function, how to retrieve the status: either using the return value or a dedicated parameter (the last one), as clCreateBuffer does: http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clCreateBuffer.html

For your code, it should give a CL_INVALID_HOST_PTR error.
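
As a side note on the synchronization doubt in the question: the blocking reads/writes (CL_TRUE) already synchronize, but the usual openCL counterpart of cudaThreadSynchronize() is clFinish, which blocks until every command previously enqueued on the queue has completed. A minimal sketch:

err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
    &global_size, NULL, 0, NULL, NULL);
clFinish(queue); /* blocks until all commands in the queue finish */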
