如何在cuda推力中实现嵌套循环 [英] How to implement nested loops in cuda thrust
问题描述
我目前必须运行一个嵌套循环如下:
for(int i = 0; i for(int j = i + 1; j <= N; j ++){
compute(...)//这里的计算
}
}
我试过离开第一个循环 CPU
并执行 GPU
中的第二个循环。结果内存访问过多
。有没有其他方法来做呢?例如 thrust :: reduce_by_key
?
整个程序在这里:
#include< thrust / device_vector.h>
#include< thrust / host_vector.h>
#include< thrust / generate.h>
#include< thrust / sort.h>
#include< thrust / binary_search.h>
#include< thrust / iterator / counting_iterator.h>
#include< thrust / random.h>
#include< cmath>
#include< iostream>
#include< iomanip>
#define N 1000000
//定义一个2d点对
typedef thrust :: tuple< float,float>点;
//返回一个随机点在[0,1] ^ 2
点make_point(void)
{
static thrust :: default_random_engine rng(12345);
static thrust :: uniform_real_distribution< float> dist(0.0f,1.0f);
float x = dist(rng);
float y = dist(rng);
return Point(x,y);
}
struct sqrt_dis:public thrust :: unary_function< Point,double>
{
float x,y;
double tmp;
sqrt_dis(float _x,float _y):x(_x),y(_y){}
__host__ __device__
浮点运算符b tmp =(thrust :: get 0(a)-x)*(thrust :: get 0(a)-x)+ \
(thrust :: get 1 -y)*(thrust :: get 1(a)-y);
tmp = -1.0 *(sqrt(tmp));
return(1.0 / tmp);
}
};
int main(void){
clock_t t1,t2;
double result;
t1 = clock();
//在主机上的单位平方分配一些随机点
thrust :: host_vector< Point> h_points(N);
thrust :: generate(h_points.begin(),h_points.end(),make_point);
//转移到设备
thrust :: device_vector< Point> points = h_points;
thrust :: plus< double> binary_op;
float init = 0;
for(int i = 0; i Point tmp_i = points [i];
float x = thrust :: get< 0>(tmp_i);
float y = thrust :: get"(tmp_i);
result + = thrust :: transform_reduce(points.begin()+ i,\
points.end(),sqrt_dis(x,y),\
init,binary_op);
std :: cout<<result<< i<<:<< result<< std :: endl;
}
t2 = clock() - t1;
std :: cout<<result:;
std :: cout.precision(10);
std :: cout<< result<< std :: endl;
std :: cout<<run time:<<< t2 / CLOCKS_PER_SEC<<s<< std :: endl;
return 0;
}
发布一个例子,这里是如何解决它:
你有 n
like this(here n = 4
)
points = [p0 p1 p2 p3]
根据您的代码,我假设您要计算:
result = f(p0,p1)+ f(p0,p2)+ f(p0,p3)+
f f(p1,p3)+
f(p2,p3)
c> f()是你需要执行的距离函数 m
次数:
m =(n-1)* n / 2
这个例子: m = 6
你可以把这个问题看成一个三角矩阵:
[p0 p1 p2 p3]
[p1 p2 p3]
[p2 p3]
[p3]
将此矩阵转换为一个线性向量 m
元素,而省略对角元素导致:
[p1 p2 p3 p2 p3 p3]
向量中元素的索引为 k = [0,m-1]
。
索引 k
可以重新映射到三角矩阵的列和行到 k - > (sqrt(-8 *)),其中(i,j) k + 4 * n *(n-1)-7)/2.0-0.5)
j = k + i + 1-n *(n-1)/ 2 + )/ 2
i
j
在我们的示例中:
0 - > (0,1)
1 - > (0,2)
2 - > (0,3)
3 - > (1,2)
4 - > (1,3)
5 - > (2,3)
现在你可以把所有这些放在一起并执行一个修改的距离函数 m
次,应用上述映射以根据索引获取相应的对,然后总计一切。
#include< thrust / device_vector.h>
#include< thrust / generate.h>
#include< thrust / iterator / counting_iterator.h>
#include< thrust / transform_reduce.h>
#include< thrust / random.h>
#include< math.h>
#include< iostream>
#include< stdio.h>
#include< stdint.h>
#define PRINT_DEBUG
typedef float Float;
//定义一个2d点对
typedef thrust :: tuple< Float,Float>点;
//返回一个随机点在[0,1] ^ 2
点make_point(void)
{
static thrust :: default_random_engine rng(12345);
static thrust :: uniform_real_distribution< Float> dist(0.0,1.0);
Float x = dist(rng);
Float y = dist(rng);
return Point(x,y);
}
struct sqrt_dis_new
{
typedef thrust :: device_ptr< Point> DevPtr;
DevPtr点;
const uint64_t n;
__host__
sqrt_dis_new(uint64_t n,DevPtr p):n(n),points(p)
{
}
__device__
Float operator()(uint64_t k)const
{
//计算三角形矩阵中的索引
const uint64_t i = n - 2 - floor(sqrt(double) 8 * k + 4 * n *(n-1)-7))/ 2.0-0.5);
const uint64_t j = k + i + 1-n *(n-1)/ 2 +(n-i)*((n-i)-1)/ 2;
#ifdef PRINT_DEBUG
printf(%llu - >(%llu,%llu)\\\
,k,i,j);
#endif
const Point& p1 = *(points.get()+ j);
const Point& p2 = *(points.get()+ i);
const Float xm = thrust :: get< 0>(p1)-thrust :: get< 0>(p2)
const Float ym = thrust :: get(p1)-thrust :: get(p2);
return 1.0 /( - 1.0 * sqrt(xm * xm + ym * ym));
}
};
int main()
{
const uint64_t N = 4;
//在主机上的单位平方分配一些随机点
thrust :: host_vector< Point> h_points(N);
thrust :: generate(h_points.begin(),h_points.end(),make_point);
//转移到设备
thrust :: device_vector< Point> d_points = h_points;
const uint64_t count =(N-1)* N / 2;
std :: cout<< count<< std :: endl;
thrust :: plus< Float> binary_op;
const Float init = 0.0;
Float result = thrust :: transform_reduce(thrust :: make_counting_iterator((uint64_t)0),
thrust :: make_counting_iterator(count),
sqrt_dis_new(N,d_points.data ()),
init,
binary_op);
std :: cout.precision(10);
std :: cout<<result:<<结果< std :: endl;
return 0;
}
code> compute 函数。
通常,对于 i
和 j
的每个组合,以2D方式展开循环并启动内核。如果计算是独立的。
查看Thrust示例,并找出与您的问题类似的用例。
I currently have to run a nested loop as follow:
for(int i = 0; i < N; i++){
for(int j = i+1; j <= N; j++){
compute(...)//some calculation here
}
}
I've tried leaving the first loop in CPU
and do the second loop in GPU
. Results are too many memory access
. Is there any other ways to do it? For example by thrust::reduce_by_key
?
The whole program is here:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <cmath>
#include <iostream>
#include <iomanip>
#define N 1000000
// define a 2d point pair
typedef thrust::tuple<float, float> Point;
// return a random Point in [0,1)^2
Point make_point(void)
{
static thrust::default_random_engine rng(12345);
static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
float x = dist(rng);
float y = dist(rng);
return Point(x,y);
}
struct sqrt_dis: public thrust::unary_function<Point, double>
{
float x, y;
double tmp;
sqrt_dis(float _x, float _y): x(_x), y(_y){}
__host__ __device__
float operator()(Point a)
{
tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\
(thrust::get<1>(a)-y)*(thrust::get<1>(a)-y);
tmp = -1.0*(sqrt(tmp));
return (1.0/tmp);
}
};
int main(void) {
clock_t t1, t2;
double result;
t1 = clock();
// allocate some random points in the unit square on the host
thrust::host_vector<Point> h_points(N);
thrust::generate(h_points.begin(), h_points.end(), make_point);
// transfer to device
thrust::device_vector<Point> points = h_points;
thrust::plus<double> binary_op;
float init = 0;
for(int i = 0; i < N; i++){
Point tmp_i = points[i];
float x = thrust::get<0>(tmp_i);
float y = thrust::get<1>(tmp_i);
result += thrust::transform_reduce(points.begin()+i,\
points.end(),sqrt_dis(x,y),\
init,binary_op);
std::cout<<"result"<<i<<": "<<result<<std::endl;
}
t2 = clock()-t1;
std::cout<<"result: ";
std::cout.precision(10);
std::cout<< result <<std::endl;
std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
return 0;
}
EDIT: Now that you have posted an example, here is how you could solve it:
You have n
2D points stored in a linear array like this (here n=4
)
points = [p0 p1 p2 p3]
Based on your code I assume you want to calculate:
result = f(p0, p1) + f(p0, p2) + f(p0, p3) +
f(p1, p2) + f(p1, p3) +
f(p2, p3)
Where f()
is your distance function which needs to be executed m
times in total:
m = (n-1)*n/2
in this example: m=6
You can look at this problem as a triangular matrix:
[ p0 p1 p2 p3 ]
[ p1 p2 p3 ]
[ p2 p3 ]
[ p3 ]
Transforming this matrix into a linear vector with m
elements while leaving out the diagonal elements results in:
[p1 p2 p3 p2 p3 p3]
The index of an element in the vector is k = [0,m-1]
.
Index k
can be remapped to columns and rows of the triangular matrix to k -> (i,j)
:
i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2
i
is the row and j
is the column.
In our example:
0 -> (0, 1)
1 -> (0, 2)
2 -> (0, 3)
3 -> (1, 2)
4 -> (1, 3)
5 -> (2, 3)
Now you can put all this together and execute a modified distance functor m
times which applies the aforementioned mapping to get the corresponding pairs based on the index and then sum up everything.
I modified your code accordingly:
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/random.h>
#include <math.h>
#include <iostream>
#include <stdio.h>
#include <stdint.h>
#define PRINT_DEBUG
typedef float Float;
// define a 2d point pair
typedef thrust::tuple<Float, Float> Point;
// return a random Point in [0,1)^2
Point make_point(void)
{
static thrust::default_random_engine rng(12345);
static thrust::uniform_real_distribution<Float> dist(0.0, 1.0);
Float x = dist(rng);
Float y = dist(rng);
return Point(x,y);
}
struct sqrt_dis_new
{
typedef thrust::device_ptr<Point> DevPtr;
DevPtr points;
const uint64_t n;
__host__
sqrt_dis_new(uint64_t n, DevPtr p) : n(n), points(p)
{
}
__device__
Float operator()(uint64_t k) const
{
// calculate indices in triangular matrix
const uint64_t i = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5);
const uint64_t j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2;
#ifdef PRINT_DEBUG
printf("%llu -> (%llu, %llu)\n", k,i,j);
#endif
const Point& p1 = *(points.get()+j);
const Point& p2 = *(points.get()+i);
const Float xm = thrust::get<0>(p1)-thrust::get<0>(p2);
const Float ym = thrust::get<1>(p1)-thrust::get<1>(p2);
return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));
}
};
int main()
{
const uint64_t N = 4;
// allocate some random points in the unit square on the host
thrust::host_vector<Point> h_points(N);
thrust::generate(h_points.begin(), h_points.end(), make_point);
// transfer to device
thrust::device_vector<Point> d_points = h_points;
const uint64_t count = (N-1)*N/2;
std::cout << count << std::endl;
thrust::plus<Float> binary_op;
const Float init = 0.0;
Float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),
thrust::make_counting_iterator(count),
sqrt_dis_new(N, d_points.data()),
init,
binary_op);
std::cout.precision(10);
std::cout<<"result: " << result << std::endl;
return 0;
}
It depends on your compute
function which you do not specify.
Usually you unroll the loops and launch the kernel in a 2D manner for every combination of i
and j
if the computations are independent.
Have a look at the Thrust examples and identify similar use cases to your problem.
这篇关于如何在cuda推力中实现嵌套循环的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!