如何在cuda推力中实现嵌套循环 [英] How to implement nested loops in cuda thrust

查看:103
本文介绍了如何在cuda推力中实现嵌套循环的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我目前必须运行一个嵌套循环如下:

  for(int i = 0; i  for(int j = i + 1; j <= N; j ++){
compute(...)//这里的计算
}
}

我试过离开第一个循环 CPU 并执行 GPU 中的第二个循环。结果内存访问过多。有没有其他方法来做呢?例如 thrust :: reduce_by_key



整个程序在这里:

  #include< thrust / device_vector.h> 
#include< thrust / host_vector.h>
#include< thrust / generate.h>
#include< thrust / sort.h>
#include< thrust / binary_search.h>
#include< thrust / iterator / counting_iterator.h>
#include< thrust / random.h>
#include< cmath>
#include< iostream>
#include< iomanip>

#define N 1000000

//定义一个2d点对
typedef thrust :: tuple< float,float>点;

//返回一个随机点在[0,1] ^ 2
点make_point(void)
{
static thrust :: default_random_engine rng(12345);
static thrust :: uniform_real_distribution< float> dist(0.0f,1.0f);
float x = dist(rng);
float y = dist(rng);
return Point(x,y);
}

struct sqrt_dis:public thrust :: unary_function< Point,double>
{
float x,y;
double tmp;
sqrt_dis(float _x,float _y):x(_x),y(_y){}
__host__ __device__
浮点运算符b tmp =(thrust :: get 0(a)-x)*(thrust :: get 0(a)-x)+ \
(thrust :: get 1 -y)*(thrust :: get 1(a)-y);
tmp = -1.0 *(sqrt(tmp));
return(1.0 / tmp);
}

};


int main(void){
clock_t t1,t2;
double result;

t1 = clock();
//在主机上的单位平方分配一些随机点
thrust :: host_vector< Point> h_points(N);
thrust :: generate(h_points.begin(),h_points.end(),make_point);

//转移到设备
thrust :: device_vector< Point> points = h_points;

thrust :: plus< double> binary_op;
float init = 0;

for(int i = 0; i Point tmp_i = points [i];
float x = thrust :: get< 0>(tmp_i);
float y = thrust :: get"(tmp_i);
result + = thrust :: transform_reduce(points.begin()+ i,\
points.end(),sqrt_dis(x,y),\
init,binary_op);
std :: cout<<result<< i<<:<< result<< std :: endl;
}
t2 = clock() - t1;
std :: cout<<result:;
std :: cout.precision(10);
std :: cout<< result<< std :: endl;
std :: cout<<run time:<<< t2 / CLOCKS_PER_SEC<<s<< std :: endl;
return 0;
}


解决方案

发布一个例子,这里是如何解决它:



你有 n like this(here n = 4

  points = [p0 p1 p2 p3] 

根据您的代码,我假设您要计算:

  result = f(p0,p1)+ f(p0,p2)+ f(p0,p3)+ 
f f(p1,p3)+
f(p2,p3)

c> f()是你需要执行的距离函数 m 次数

  m =(n-1)* n / 2 

这个例子: m = 6



你可以把这个问题看成一个三角矩阵:

  [p0 p1 p2 p3] 
[p1 p2 p3]
[p2 p3]
[p3]

将此矩阵转换为一个线性向量 m 元素,而省略对角元素导致:

  [p1 p2 p3 p2 p3 p3] 

向量中元素的索引为 k = [0,m-1]
索引 k 可以重新映射到三角矩阵的列和行 k - > (sqrt(-8 *)),其中(i,j) k + 4 * n *(n-1)-7)/2.0-0.5)
j = k + i + 1-n *(n-1)/ 2 + )/ 2

i j



在我们的示例中:

  0  - > (0,1)
1 - > (0,2)
2 - > (0,3)
3 - > (1,2)
4 - > (1,3)
5 - > (2,3)

现在你可以把所有这些放在一起并执行一个修改的距离函数 m 次,应用上述映射以根据索引获取相应的对,然后总计一切。



  #include< thrust / device_vector.h> 
#include< thrust / generate.h>
#include< thrust / iterator / counting_iterator.h>
#include< thrust / transform_reduce.h>
#include< thrust / random.h>
#include< math.h>
#include< iostream>
#include< stdio.h>
#include< stdint.h>

#define PRINT_DEBUG

typedef float Float;

//定义一个2d点对
typedef thrust :: tuple< Float,Float>点;

//返回一个随机点在[0,1] ^ 2
点make_point(void)
{
static thrust :: default_random_engine rng(12345);
static thrust :: uniform_real_distribution< Float> dist(0.0,1.0);
Float x = dist(rng);
Float y = dist(rng);
return Point(x,y);
}


struct sqrt_dis_new
{
typedef thrust :: device_ptr< Point> DevPtr;

DevPtr点;
const uint64_t n;

__host__
sqrt_dis_new(uint64_t n,DevPtr p):n(n),points(p)
{
}

__device__
Float operator()(uint64_t k)const
{
//计算三角形矩阵中的索引
const uint64_t i = n - 2 - floor(sqrt(double) 8 * k + 4 * n *(n-1)-7))/ 2.0-0.5);
const uint64_t j = k + i + 1-n *(n-1)/ 2 +(n-i)*((n-i)-1)/ 2;

#ifdef PRINT_DEBUG
printf(%llu - >(%llu,%llu)\\\
,k,i,j);
#endif

const Point& p1 = *(points.get()+ j);
const Point& p2 = *(points.get()+ i);

const Float xm = thrust :: get< 0>(p1)-thrust :: get< 0>(p2)
const Float ym = thrust :: get(p1)-thrust :: get(p2);

return 1.0 /( - 1.0 * sqrt(xm * xm + ym * ym));
}
};


int main()
{
const uint64_t N = 4;

//在主机上的单位平方分配一些随机点
thrust :: host_vector< Point> h_points(N);
thrust :: generate(h_points.begin(),h_points.end(),make_point);

//转移到设备
thrust :: device_vector< Point> d_points = h_points;

const uint64_t count =(N-1)* N / 2;

std :: cout<< count<< std :: endl;


thrust :: plus< Float> binary_op;
const Float init = 0.0;

Float result = thrust :: transform_reduce(thrust :: make_counting_iterator((uint64_t)0),
thrust :: make_counting_iterator(count),
sqrt_dis_new(N,d_points.data ()),
init,
binary_op);

std :: cout.precision(10);
std :: cout<<result:<<结果< std :: endl;

return 0;
}






code> compute 函数。
通常,对于 i j 的每个组合,以2D方式展开循环并启动内核。如果计算是独立的。
查看Thrust示例,并找出与您的问题类似的用例。


I currently have to run a nested loop as follow:

for(int i = 0; i < N; i++){
    for(int j = i+1; j <= N; j++){
        compute(...)//some calculation here
    }
}

I've tried leaving the first loop in CPU and do the second loop in GPU. Results are too many memory access. Is there any other ways to do it? For example by thrust::reduce_by_key?

The whole program is here:

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <cmath>
#include <iostream>
#include <iomanip>

#define N 1000000

// define a 2d point pair
typedef thrust::tuple<float, float> Point;

// return a random Point in [0,1)^2
Point make_point(void)
{
  static thrust::default_random_engine rng(12345);
  static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
  float x = dist(rng);
  float y = dist(rng);
  return Point(x,y);
}

struct sqrt_dis: public thrust::unary_function<Point, double>
{
  float x, y;
  double tmp;
  sqrt_dis(float _x, float _y): x(_x), y(_y){}
  __host__ __device__
  float operator()(Point a)
 {
    tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\
    (thrust::get<1>(a)-y)*(thrust::get<1>(a)-y);
    tmp = -1.0*(sqrt(tmp));
    return (1.0/tmp);
 }

};


int main(void) {
  clock_t t1, t2;
  double result;

  t1 = clock();
  // allocate some random points in the unit square on the host
  thrust::host_vector<Point> h_points(N);
  thrust::generate(h_points.begin(), h_points.end(), make_point);

  // transfer to device
  thrust::device_vector<Point> points = h_points;

  thrust::plus<double> binary_op;
  float init = 0;

  for(int i = 0; i < N; i++){
    Point tmp_i = points[i];
    float x = thrust::get<0>(tmp_i);
    float y = thrust::get<1>(tmp_i);
    result += thrust::transform_reduce(points.begin()+i,\
                                       points.end(),sqrt_dis(x,y),\
                                       init,binary_op);
    std::cout<<"result"<<i<<": "<<result<<std::endl;
  }
  t2 = clock()-t1;
  std::cout<<"result: ";
  std::cout.precision(10);
  std::cout<< result <<std::endl;
  std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
  return 0;
 }

解决方案

EDIT: Now that you have posted an example, here is how you could solve it:

You have n 2D points stored in a linear array like this (here n=4)

points = [p0 p1 p2 p3]

Based on your code I assume you want to calculate:

result = f(p0, p1) + f(p0, p2) + f(p0, p3) +
         f(p1, p2) + f(p1, p3) +
         f(p2, p3)

Where f() is your distance function which needs to be executed m times in total:

m = (n-1)*n/2

in this example: m=6

You can look at this problem as a triangular matrix:

[ p0 p1 p2 p3 ] 
[    p1 p2 p3 ]
[       p2 p3 ]
[          p3 ]

Transforming this matrix into a linear vector with m elements while leaving out the diagonal elements results in:

[p1 p2 p3 p2 p3 p3]

The index of an element in the vector is k = [0,m-1]. Index k can be remapped to columns and rows of the triangular matrix to k -> (i,j):

i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2

i is the row and j is the column.

In our example:

0 -> (0, 1)
1 -> (0, 2)
2 -> (0, 3)
3 -> (1, 2)
4 -> (1, 3)
5 -> (2, 3)

Now you can put all this together and execute a modified distance functor m times which applies the aforementioned mapping to get the corresponding pairs based on the index and then sum up everything.

I modified your code accordingly:

#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/random.h>
#include <math.h>
#include <iostream>
#include <stdio.h>
#include <stdint.h>

#define PRINT_DEBUG

typedef float Float;

// define a 2d point pair
typedef thrust::tuple<Float, Float> Point;

// return a random Point in [0,1)^2
Point make_point(void)
{
    static thrust::default_random_engine rng(12345);
    static thrust::uniform_real_distribution<Float> dist(0.0, 1.0);
    Float x = dist(rng);
    Float y = dist(rng);
    return Point(x,y);
}


struct sqrt_dis_new
{
    typedef thrust::device_ptr<Point> DevPtr;

    DevPtr points;
    const uint64_t n;

    __host__
    sqrt_dis_new(uint64_t n, DevPtr p) : n(n), points(p)
    {
    }

    __device__ 
    Float operator()(uint64_t k) const
    {
        // calculate indices in triangular matrix
        const uint64_t i = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5);
        const uint64_t j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2;

#ifdef PRINT_DEBUG
        printf("%llu -> (%llu, %llu)\n", k,i,j);
#endif

        const Point& p1 = *(points.get()+j);
        const Point& p2 = *(points.get()+i);

        const Float xm = thrust::get<0>(p1)-thrust::get<0>(p2);
        const Float ym = thrust::get<1>(p1)-thrust::get<1>(p2);

        return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));
    }
};


int main()
{
    const uint64_t N = 4;

    // allocate some random points in the unit square on the host
    thrust::host_vector<Point> h_points(N);
    thrust::generate(h_points.begin(), h_points.end(), make_point);

    // transfer to device
    thrust::device_vector<Point> d_points = h_points;

    const uint64_t count = (N-1)*N/2;

    std::cout << count << std::endl;


    thrust::plus<Float> binary_op;
    const Float init = 0.0;

    Float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),
                                            thrust::make_counting_iterator(count),
                                            sqrt_dis_new(N, d_points.data()),
                                            init,
                                            binary_op);

    std::cout.precision(10);  
    std::cout<<"result: " << result << std::endl;

    return 0;
}    


It depends on your compute function which you do not specify. Usually you unroll the loops and launch the kernel in a 2D manner for every combination of i and j if the computations are independent. Have a look at the Thrust examples and identify similar use cases to your problem.

这篇关于如何在cuda推力中实现嵌套循环的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆