N体CUDA优化 [英] N-Body CUDA optimization

查看:814
本文介绍了N体CUDA优化的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我在CUDA中开发了一个N-body算法,我想了解一些优化的提示和技巧。



我设法获得<$在NVIDIA Geforce GTX 260上运行 16384 的主体 20Flops ,其中 27 Streaming Multiprocessors。



KernelcomputeForces 函数是慢速捅约 95 %的时间,我想知道是否还有我可以做的优化我的代码。



据我所知,针对内存空间局部性和内存写入进行了优化。在CUDA文档中的某处,它说共享内存更快,但我不明白我可以如何使用。我在 16 块中分配了 512 个线程,但是对我来说有点模糊。 / p>

请帮助并感谢您阅读。

  body 

gm是gpu质量指针

gpx是gpu位置x指针

gpy是gpu位置y指针

gpz是gpu位置z指针

gfx是gpu力x指针

gfy是gpu力y指针

gfz是gpu力z指针

相关的内核函数

  __ global__ void KernelcomputeForces(unsigned int n,float * gm,float * gpx,float * gpy,float * gpz,float * gfx,float * gfy ,float * gfz){
int tid = blockDim.x * blockIdx.x + threadIdx.x;
int numThreads = BlockDim.x * gridDim.x;

float GRAVITY = 0.00001f;

//全部比较所有
for(unsigned int ia = tid; ia< n; ia + = numThreads){
float lfx = 0.0f;
float lfy = 0.0f;
float lfz = 0.0f;

for(unsigned int ib = 0; ib // compute distance
float dx =(gpx [ib] - gpx [ia]);
float dy =(gpy [ib] -gpy [ia]);
float dz =(gpz [ib] - gpz [ia]);
// float distance = sqrt(dx * dx + dy * dy + dz * dz);
float distanceSquared = dx * dx + dy * dy + dz * dz;

//防止弹弓和除以零
// distance + = 0.1f;
distanceSquared + = 0.01f;

//计算物体之间的重力量值
// float magnitude = GRAVITY *(gm [ia] * gm [ib])/(distance * distance * distance * distance)
float magnitude = GRAVITY *(gm [ia] * gm [ib])/(distanceSquared);

//计算物体的力量
//幅度乘以方向
lfx + =幅度*(dx);
lfy + = magnitude *(dy);
lfz + = magnitude *(dz);
}

//将本地内存存储到全局内存
gfx [ia] = lfx;
gfy [ia] = lfy;
gfz [ia] = lfz;
}
}

extern void GPUcomputeForces(unsigned int n,float * gm,float * gpx,float * gpy,float * gpz,float * gfx,float * gfy, float * gfz){
dim3 gridDim(16,1,1); //指定三个可能维度中的块数
dim3 blockDim(512,1,1); //每个线程的线程
KernelcomputeForces<<<< gridDim,blockDim>>>>(n,gm,gpx,gpy,gpz,gfx,gfy,gfz);
}


解决方案

共享内存将在这种内核中有用的优化 - 它允许粒子位置和质量的读取的聚结,这在GT200上将是非常重要的。我发现这大约是你的版本的两倍(使用128个128线程的128个块的16384粒子启动):

  template< int blocksize> 
__global__
void KernelcomputeForces1(unsigned int n1,float * gm,float * gpx,
float * gpy,float * gpz,float * gfx,float * gfy,float * gfz)
{
__shared__ float lgpx [blocksize],lgpy [blocksize],
lgpz [blocksize],lgm [blocksize];

const float GRAVITY = 0.00001f;

//全部比较所有
float lfx = 0.0f,lfy = 0.0f,lfz = 0.0f;
int ia = blockDim.x * blockIdx.x + threadIdx.x;
float lgpx0 = gpx [ia],lgpy0 = gpy [ia],
lgpz0 = gpz [ia],lgm0 = gm [ia]

for(unsigned int ib = 0; ib
lgpx [threadIdx.x] = gpx [ib + threadIdx.x];
lgpy [threadIdx.x] = gpy [ib + threadIdx.x];
lgpz [threadIdx.x] = gpz [ib + threadIdx.x];
lgm [threadIdx.x] = gm [ib + threadIdx.x];
__syncthreads();

#pragma unroll
for(unsigned int ic = 0; ic
//计算距离
float dx = lgpx [ic] - lgpx0);
float dy =(lgpy [ic] - lgpy0);
float dz =(lgpz [ic] - lgpz0);
float distanceSquared = dx * dx + dy * dy + dz * dz;

//防止弹弓和除以零
distanceSquared + = 0.01f;

//计算物体之间的引力幅度
float magnitude = GRAVITY *(lgm0 * lgm [ic])
/(distanceSquared)

//计算物体的力量
//幅度乘以方向
lfx + =幅度*(dx);
lfy + = magnitude *(dy);
lfz + = magnitude *(dz);
}
__syncthreads();
}

//将本地内存存储到全局内存
gfx [ia] = lfx;
gfy [ia] = lfy;
gfz [ia] = lfz;
}

你需要做一些有点不同的粒子数外面的块大小的好多倍,可能是第二节,将不会展开。使用__syncthreads()调用观察经线偏离的可能性,如果不小心,可能会导致内核挂起。


I'm developing an N-body algorithm in CUDA and I would like to learn some tips and tricks for optimization.

I've managed to get 16384 bodies to run at 20Flops on an NVIDIA Geforce GTX 260, which has 27 Streaming Multiprocessors.

The KernelcomputeForces function is the slow poke taking about 95% of the time and I was wondering if there is anymore that I can do to optimize my code.

As far as I see, I've optimized for memory-space-locality and memory-writing. Somewhere in the CUDA docs, it says shared-memory is faster but I dont see how I can make use of that. I've divided the work in 16 blocks with 512 threads on each, but thats a bit fuzzy for me.

Please help and thanks for reading this.

n   is number of bodies

gm  is the gpu mass pointer

gpx is the gpu position x pointer

gpy is the gpu position y pointer

gpz is the gpu position z pointer

gfx is the gpu force x pointer

gfy is the gpu force y pointer

gfz is the gpu force z pointer

The relevant kernel function

__global__ void KernelcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int numThreads = blockDim.x * gridDim.x;

    float GRAVITY = 0.00001f;

    //compare all with all
    for( unsigned int ia=tid; ia<n; ia+=numThreads ){
        float lfx = 0.0f;
        float lfy = 0.0f;
        float lfz = 0.0f;

        for( unsigned int ib=0; ib<n; ib++ ){
            //compute distance
            float dx = ( gpx[ib] - gpx[ia]);
            float dy = ( gpy[ib] - gpy[ia] );
            float dz = ( gpz[ib] - gpz[ia] );
            //float distance = sqrt( dx*dx + dy*dy + dz*dz );
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            //prevent slingshots and division by zero
            //distance += 0.1f;
            distanceSquared += 0.01f;

            //calculate gravitational magnitude between the bodies
            //float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distance * distance * distance * distance );
            float magnitude = GRAVITY * ( gm[ia] * gm[ib] ) / ( distanceSquared );

            //calculate forces for the bodies
            //magnitude times direction
            lfx += magnitude * ( dx );
            lfy += magnitude * ( dy );
            lfz += magnitude * ( dz );
        }

        //stores local memory to global memory
        gfx[ia] = lfx;
        gfy[ia] = lfy;
        gfz[ia] = lfz;
    }
}

extern void GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){  
    dim3 gridDim( 16, 1, 1 ); //specifys how many blocks in three possible dimensions
    dim3 blockDim( 512, 1, 1 ); //threads per block
    KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
}

解决方案

Shared memory is going to be a useful optimization in this sort of kernel - it allows coalescing of the reads of particle positions and masses, which on a GT200 will be very important. I found this to be about twice as fast as your version (launched with your 16384 particles using a 128 blocks of 128 threads):

template<int blocksize>
__global__
void KernelcomputeForces1( unsigned int n1, float* gm, float* gpx,
                float* gpy, float* gpz, float* gfx, float* gfy, float* gfz )
{
    __shared__ float lgpx[blocksize], lgpy[blocksize],
               lgpz[blocksize], lgm[blocksize];

    const float GRAVITY = 0.00001f;

    //compare all with all
    float lfx = 0.0f, lfy = 0.0f, lfz = 0.0f;
    int ia = blockDim.x * blockIdx.x + threadIdx.x;
    float lgpx0 = gpx[ia], lgpy0 = gpy[ia],
          lgpz0 = gpz[ia], lgm0 = gm[ia];

    for( unsigned int ib=0; ib<n1; ib+=blocksize ){

        lgpx[threadIdx.x] = gpx[ib + threadIdx.x];
        lgpy[threadIdx.x] = gpy[ib + threadIdx.x];
        lgpz[threadIdx.x] = gpz[ib + threadIdx.x];
        lgm[threadIdx.x] = gm[ib + threadIdx.x];
        __syncthreads();

#pragma unroll
        for(unsigned int ic=0; ic<blocksize; ic++) {

            //compute distance
            float dx = ( lgpx[ic] - lgpx0 );
            float dy = ( lgpy[ic] - lgpy0 );
            float dz = ( lgpz[ic] - lgpz0 );
            float distanceSquared = dx*dx + dy*dy + dz*dz;

            //prevent slingshots and division by zero
            distanceSquared += 0.01f;

            //calculate gravitational magnitude between the bodies
            float magnitude = GRAVITY * ( lgm0 * lgm[ic] )
                    / ( distanceSquared );

            //calculate forces for the bodies
            //magnitude times direction
            lfx += magnitude * ( dx );
            lfy += magnitude * ( dy );
            lfz += magnitude * ( dz );
        }
        __syncthreads();
    }

    //stores local memory to global memory
    gfx[ia] = lfx;
    gfy[ia] = lfy;
    gfz[ia] = lfz;
}

You would need to do something a little different for the number of particles which fall outside the nice multiple of block size, probably a second stanza which won't be unrolled. Watch the potential for warp divergence with the __syncthreads() calls, that can make the kernel hang if you are not careful.

这篇关于N体CUDA优化的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆