嵌套调用CUDA :: thrust函子的函子,以zip_iterator的形式运行 [英] functor with nested calls to CUDA::thrust functors operating as zip_iterator

查看:115
本文介绍了嵌套调用CUDA :: thrust函子的函子,以zip_iterator的形式运行的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我发现尝试使用CUDA :: Thurst迭代器来实现在GPU上运行的ODE求解器例程的过程中遇到一些困难,以解决GPU中的一系列耦合的一阶方程式。我想解决以前的中的方法问题,使用户可以使用对向量元组起作用的任意函子,尽可能地像人类一样编写方程组。详细来说,这是一小段代码:

I found some difficulties trying to implement ODEs solver routines running on GPUs using CUDA::Thurst iterators to solve a bunch of coupled first order equations in the GPU. I want to work around the approach in the former question enabling the user to write the systems of equations as human like as possible using arbitrary functors working on tuples of vectors. In details, here is a small piece of code:

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>

#include <iostream>
#include <math.h>


__host__ __device__ float f1(float x)
{
  return sinf(x);
}

__host__ __device__ float f2(float x)
{
  return cosf(x);
}

__host__ __device__ float Vx(float x)
{
  return sinf(x);
}

struct q_dot
{
  float x;
  float delta;
  q_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float p = thrust::get<1>(t) + delta;
    return  p/MASS;
  }
};


struct p_dot
{
  float x;
  float delta;
  p_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float q = thrust::get<0>(t) +   delta;
    return  -Vx(q);
  }
};




struct euler_functor
{
  unsigned fn;
  float h;
  float er;
  float x0;

  euler_functor(unsigned _fn,float _x0,float _h, float _er) : fn(_fn),h(_h),er(_er),x0(_x0) {};
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t) const  {
    // if (fn == 1) y = h*f1(y);
    //else if (fn == 2) y = h*f2(y); This can be handled in this way?

    q =  h*p_dot(x0,h/2)(t);
    p =  h*p_dot(x0,h/2)(t);
    float er_p,er_q;
    er_p=0.5*h*p_dot(x0,h/2)(t);
    er_q=0.5*h*q_dot(x0,h/2)(t);
    er = er_p;

  }
};


int main(void)
{
  float t=0;
  float t_step=0.1;
  float error;


  const unsigned N = 8;
  // allocate three device_vectors with 10 elements
  thrust::device_vector<float> Q(N),P(N);
  // initilaize to some values
  thrust::sequence(Q.begin(), Q.end(),  0.0f, (float)(6.283/(float)N));
  // initilaize to some values
  thrust::sequence(P.begin(), P.end(),  0.0f, (float)(10.283/(float)N));

  // apply euler for each element of Q and P
  //thrust::for_each(X.begin(),X.end(),euler_functor(1,t,t_step,error)); this becomes:
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(Q.begin(),P.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(  Q.end(),  P.end())),euler_functor(1,t,t_step,error));
  // print the values
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<"  "<<P[i]<< std::endl;
}

但是,当我编译以前的代码时,会遇到很多错误。同样,我不确定这是否是最好的方法。我该如何运作?我的错误在哪里?有没有更好的方法?因为携带有关数值误差信息的er变量始终返回零。如何获得此信息?可以使用它来实现一些自适应技巧。

But when I compile the former code I get a lot of errors. Again, I am not sure if this is the best way to do it. How can I make it work? Where are my error in it? Is there a better approach? as It is the er variable that carries information about the numerical error is returning always zero. Howto get this information? It can be use to implement some adaptive trick.

推荐答案

您的代码存在很多问题。我不确定我是否会在这里捕获所有这些信息,但是:

There were a number of problems with your code. I'm not sure I will capture all of them here, but:


  1. MASS
  2. 您的 p_dot q_dot 仿函数需要额外的 __ device __ 装饰

  3. p q的用法欧拉函子中的变量没有任何意义;它们没有在任何地方定义,也不是将值返回到 P Q 向量的正确方法,如果那是您的意图。

  4. 我们不会通过实例化时传递给函子的变量返回数据。因此,为了在每个时间步长返回 er 变量,我创建了一个单独的向量( ERP ERQ )。

  1. MASS was undefined.
  2. Your p_dot and q_dot functors needed additional __device__ decorations
  3. Your usage of the p and q variables inside the euler functor did not make any sense; they are not defined anywhere, nor is this the correct way to return values into P and Q vectors, if that was your intent.
  4. We don't return data through the variable passed to the functor at instantiation. Therefore to return the er variable at each timestep, I create a separate vector (ERP and ERQ) to do so.

此处是经过修改的代码,其中存在上述问题以及其他各种问题固定。尽管我没有详细检查算法,但似乎返回了明智的结果。

Here is a modified code which has the above issues and various other issues fixed. It seems to return sensible results, although I've not checked the arithmetic in detail.

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>

#include <iostream>
#include <math.h>

#define MASS 1.0f

__host__ __device__ float f1(float x)
{
  return sinf(x);
}

__host__ __device__ float f2(float x)
{
  return cosf(x);
}

__host__ __device__ float Vx(float x)
{
  return sinf(x);
}

struct q_dot
{
  float x;
  float delta;
  __host__ __device__
  q_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float p = thrust::get<1>(t) + delta;
    return  p/MASS;
  }
};


struct p_dot
{
  float x;
  float delta;
  __host__ __device__
  p_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float q = thrust::get<0>(t) +   delta;
    return  -Vx(q);
  }
};



struct euler_functor
{
  unsigned fn;
  float h;
  float x0;

  euler_functor(unsigned _fn,float _x0,float _h) : fn(_fn),h(_h),x0(_x0) {};
  template <typename Tuple>
  __host__ __device__
  void operator()(const Tuple &t) {
    // if (fn == 1) y = h*f1(y);
    //else if (fn == 2) y = h*f2(y); 
    float t0, t1, t2, t3;
    t0 =  h*p_dot(x0,h/2.0f)(t);
    t1 =  h*q_dot(x0,h/2.0f)(t);
    t2=0.5*h*p_dot(x0,h/2.0f)(t);
    t3=0.5*h*q_dot(x0,h/2.0f)(t);
    thrust::get<0>(t) = t0;
    thrust::get<1>(t) = t1;
    thrust::get<2>(t) = t2;
    thrust::get<3>(t) = t3;

  }
};


int main(void)
{
  float t=0;
  float t_step=0.1;


  const unsigned N = 8;
  // allocate three device_vectors with 10 elements
  thrust::device_vector<float> Q(N),P(N), ERP(N), ERQ(N);
  // initilaize to some values
  thrust::sequence(Q.begin(), Q.end(),  0.0f, (float)(6.283/(float)N));
  // initilaize to some values
  thrust::sequence(P.begin(), P.end(),  0.0f, (float)(10.283/(float)N));
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<" "<<P[i]<< " "<< ERP[i] << " " << ERQ[i] << std::endl;
  std::cout<< "*****" << std::endl;
  // apply euler for each element of Q and P
  //thrust::for_each(X.begin(),X.end(),euler_functor(1,t,t_step,error)); this becomes:
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(Q.begin(),P.begin(),ERP.begin(), ERQ.begin())),thrust::make_zip_iterator(thrust::make_tuple(Q.end(),P.end(),ERP.end(), ERQ.end())),euler_functor(1,t,t_step));
  // print the values
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<" "<<P[i]<< " "<< ERP[i] << " " << ERQ[i] << std::endl;
}

这篇关于嵌套调用CUDA :: thrust函子的函子,以zip_iterator的形式运行的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆