CUDA structure alignment is slowing down my code (compilable example)


Problem description

I have a simulation that calculates the 3D vectors of charged particles moving in an electric and magnetic field. I attempted to speed this up in CUDA using the __align__ specifier, thinking that perhaps the limiting factor was global memory reading and writing, but using __align__ ended up slowing it down (maybe because it increased the total memory requirement). I also tried using float3 and float4, but their performance was similar.

I've created a simplified version of this code and pasted it below to show my problem. The code below should be compilable, and by changing the definition of CASE on the fourth line to 0, 1, or 2, the different options I described above can be tried. Two functions, ParticleMoverCPU and ParticleMoverGPU, are defined to compare CPU vs. GPU performance.

  1. Is there a reason why my attempt at memory coalescing slowed my code down rather than speeding it up?
  2. Is there anything else obvious that I should be doing? Can I do better than the roughly 60x speedup for this "embarrassingly parallel" code?

Thanks!

CPU - Intel Xeon E5620 @2.40GHz

GPU - NVIDIA Tesla C2070

// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE        0   // define to either 0, 1 or 2 as described above

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>

#define CEX         10  // x-value of electric field (dimensionless and arbitrary)
#define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)

#define FACTOR      15  // I played around with these numbers until I got the best speedup
#define THREADS     256 // I played around with these numbers until I got the best speedup

typedef struct{
    float x;
    float y;
    float z;
} VecCPU;           //Struct for vectors for CPU calculation

// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif

#if CASE==1
// This method seems to be less fast.  It is an attempt to align for memory coalescence
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif

// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif

VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d;     // pointers in constant memory which we will point to data in global memory

void ParticleMoverCPU(int np, int ts, float dt){

    int n = 0;
    while (n < np){

        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;

            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}

__global__ void ParticleMoverGPU(register int np, register int ts, register float dt){

    register int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d[n].x + CEX*0.5*dt;
            vminus.y = vel_d[n].y + CEY*0.5*dt;
            vminus.z = vel_d[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d[n].x = vplus.x + CEX*0.5*dt;
            vel_d[n].y = vplus.y + CEY*0.5*dt;
            vel_d[n].z = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d[n].x += vel_d[n].x*dt;
            pos_d[n].y += vel_d[n].y*dt;
            pos_d[n].z += vel_d[n].z*dt;
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

int main(void){

    int np = 50000;                                         // Number of Particles
    const int ts = 1000;                                    // Number of Time-steps
    const float dt = 1E-3;                                  // Time-step value


    // ----------- CPU ----------- //

    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity

    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
    }

    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Output final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);

    // ------------ GPU ----------- //

    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;

    VecGPU *pos_g, *vel_g, *pos_l, *vel_l;

    pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
    vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU

    cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np);      // allocate memory for velocities on the GPU

    cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*));   // copy memory address of position to the constant memory pointer pos_d
    cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*));   // copy memory address of velocity to the constant memory pointer vel_d

    for (int n = 0; n < np; n++){
        pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
    }

    cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory

    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel

    //stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");

    cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy positions from GPU memory back to CPU
    cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy velocities from GPU memory back to CPU

    // Output GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);

    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);

    // free allocated memory
    cudaFree(pos_l);
    cudaFree(vel_l);
    free(pos_g);
    free(vel_g);
    free(pos_c);
    free(vel_c);
}

For CASE 0 (regular vector struct) I get:

CPUtime = 1302.0 ms
GPUtime =   21.8 ms
Speedup = 59.79

For CASE 1 (__align__(16) vector struct) I get:

CPUtime = 1298.0 ms
GPUtime =   24.5 ms
Speedup = 53.08

For CASE 2 (using float3) I get:

CPUtime = 1305.0 ms
GPUtime =   21.8 ms
Speedup = 59.80

If I use float4 instead of float3 I get something similar to the __align__(16) method.
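
One quick way to confirm the "it increased the total memory requirement" hypothesis is to print the struct sizes on the host. The following is a minimal standalone check (a sketch for illustration, not part of the original listing), compiled as a .cu file with nvcc so that __align__, float3, and float4 are available:

    #include "cuda_runtime.h"
    #include <stdio.h>

    typedef struct { float x, y, z; } Vec3;                    // CASE 0 layout: 3 packed floats
    typedef struct __align__(16) { float x, y, z; } Vec3A16;   // CASE 1 layout: padded to 16 bytes

    int main(void){
        printf("sizeof(Vec3)    = %d\n", (int)sizeof(Vec3));     // 12
        printf("sizeof(Vec3A16) = %d\n", (int)sizeof(Vec3A16));  // 16
        printf("sizeof(float3)  = %d\n", (int)sizeof(float3));   // 12
        printf("sizeof(float4)  = %d\n", (int)sizeof(float4));   // 16
        return 0;
    }

The 16-byte variants carry 4 bytes of padding per vector, which is consistent with float4 behaving like the __align__(16) struct here.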

Thanks!

Answer

  1. The pointers in __constant__ memory are a waste of your time. I'm not sure why you would jump through all those hoops.
  2. Dropping register everywhere is a waste of your time. The compiler already uses registers wherever possible; you're not smarter than it on this point.
  3. In case you're not already doing it, you should use proper cuda error checking. That's just a boilerplate statement I make; I don't think there are any API-level errors in this code. (A minimal macro sketch appears after the coalescing discussion below.)
  4. It's not clear you understand what "coalescence" is. Alignment of data only tangentially affects the ability of memory transactions to coalesce. What matters more is the actual addresses generated by adjacent threads in the warp for a given memory transaction -- do they refer to adjacent memory locations? If so, things are probably coalescing nicely. If not, probably not. You have a data structure which "naturally" occupies 12 bytes, and in one case (the slower one) you are telling it to occupy 16 bytes instead. What does this do exactly? To answer that we have to look at a given transaction:

    vminus.x = vel_d[n].x + CEX*0.5*dt;

The above transaction is requesting the x-component of the vel_d vector. In the "non-align" case, that data will be stored like this, and the above transaction will "ask" for the starred quantities (32 per warp):

mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 ...
vel_d:  x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 ...
         *        *        *        *        *        *       ...

In the "align" case, the above pattern would look like:

mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17    ...
vel_d:  x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
         *           *           *           *           *       ...

So we see that when you specify the align directive, the packing is less dense, and a given 128 byte cacheline supplies fewer of the necessary items for the given transaction. Therefore more cachelines must be retrieved from global memory to satisfy this one read request in the align case. This is likely accounting for the ~10-20% difference you are seeing.
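
As an aside on point 3 above, a minimal error-checking macro could look like the following. This is just a sketch of the usual pattern, not something taken from the code in this question; it assumes it lives in the same .cu file, so the CUDA runtime header is already pulled in by nvcc:

    #include <stdio.h>
    #include <stdlib.h>

    #define cudaCheck(call)                                               \
        do {                                                              \
            cudaError_t err_ = (call);                                    \
            if (err_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                        cudaGetErrorString(err_), __FILE__, __LINE__);    \
                exit(EXIT_FAILURE);                                       \
            }                                                             \
        } while (0)

    // Example usage with the code in the question:
    //   cudaCheck(cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np));
    //   ParticleMoverGPU<<<blocks*FACTOR, THREADS>>>(np, ts, dt);
    //   cudaCheck(cudaGetLastError());       // catches launch-configuration errors
    //   cudaCheck(cudaDeviceSynchronize());  // catches errors during kernel execution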

But we can do better than the above. You have a classic AoS (array of structures) data storage scheme, and that is canonically bad for GPU programming. A standard performance enhancement is to convert from AoS to SoA (structure of arrays) storage. This means breaking out the x, y, z components of your pos and vel vectors into separate arrays and then accessing those. (Alternatively, since you are processing all components in a single thread, you could try doing a vector load; a brief sketch of that idea follows the SoA access snippet below, but it is a separate discussion.) The desired storage and load pattern then becomes:

mem idx:  0  1  2  3  4  5  6  7  8  9  ...
vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9  ...
          *  *  *  *  *  *  *  *  *  *  ...

and the code might look like this:

    vminus.x = vel_d_x[n] + CEX*0.5*dt;
    vminus.y = vel_d_y[n] + CEY*0.5*dt;
    vminus.z = vel_d_z[n] + CEZ*0.5*dt;
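
For completeness, here is a rough sketch of the vector-load alternative mentioned above (an illustration, not part of the answer's code): keep an array-of-structures layout but pad each particle to a float4, so each thread fetches its whole vector with a single 16-byte load instead of three separate 4-byte loads. It reuses the CEX/CEY/CEZ and CBX/CBY/CBZ macros from the listings, the .w component is unused padding, and the host allocations and cudaMemcpy calls would have to switch to float4 as well.

__global__ void ParticleMoverGPU_vec(float4 *vel_d, float4 *pos_d, int np, int ts, float dt){

    int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        float3 tvec;
        float tvec_fact;
        int it = 0;
        tvec.x = CBX*0.5*dt;
        tvec.y = CBY*0.5*dt;
        tvec.z = CBZ*0.5*dt;
        tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
        while (it < ts){
            float4 v = vel_d[n];                  // one 16-byte (128-bit) load per thread
            float4 p = pos_d[n];                  // one 16-byte load per thread
            // ----- Update velocities by the Boris method (same math as before) ------ //
            float3 vminus, vprime;
            vminus.x = v.x + CEX*0.5*dt;
            vminus.y = v.y + CEY*0.5*dt;
            vminus.z = v.z + CEZ*0.5*dt;
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            v.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact + CEX*0.5*dt;
            v.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact + CEY*0.5*dt;
            v.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact + CEZ*0.5*dt;
            // ------ Update particle positions -------------- //
            p.x += v.x*dt;
            p.y += v.y*dt;
            p.z += v.z*dt;
            vel_d[n] = v;                         // one 16-byte store per thread
            pos_d[n] = p;                         // one 16-byte store per thread
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

Whether this actually beats the SoA version would have to be measured; the full SoA implementation below remains the main recommendation here.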

The following code implements some of the above, including the AoS -> SoA transformation for the GPU side, and should be faster than any of your cases.

$ cat t895.cu
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE        0   // define to either 0, 1 or 2 as described above

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>

#define CEX         10  // x-value of electric field (dimensionless and arbitrary)
#define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)

#define FACTOR      15  // I played around with these numbers until I got the best speedup
#define THREADS     256 // I played around with these numbers until I got the best speedup

typedef struct{
    float x;
    float y;
    float z;
} VecCPU;           //Struct for vectors for CPU calculation

// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif

#if CASE==1
// This method seems to be less fast.  It is an attempt to align for memory coalescence
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif

// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif

VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation

void ParticleMoverCPU(int np, int ts, float dt){

    int n = 0;
    while (n < np){

        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;

            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}

__global__ void ParticleMoverGPU(float *vel_d_x, float *vel_d_y, float *vel_d_z, float *pos_d_x, float *pos_d_y, float *pos_d_z, int np, int ts, float dt){

    int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        VecGPU vminus, tvec, vprime, vplus;// , vtemp;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d_x[n] + CEX*0.5*dt;
            vminus.y = vel_d_y[n] + CEY*0.5*dt;
            vminus.z = vel_d_z[n] + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d_x[n] = vplus.x + CEX*0.5*dt;
            vel_d_y[n] = vplus.y + CEY*0.5*dt;
            vel_d_z[n] = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d_x[n] += vel_d_x[n]*dt;
            pos_d_y[n] += vel_d_y[n]*dt;
            pos_d_z[n] += vel_d_z[n]*dt;
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

int main(void){

    int np = 50000;                                         // Number of Particles
    const int ts = 1000;                                    // Number of Time-steps
    const float dt = 1E-3;                                  // Time-step value


    // ----------- CPU ----------- //

    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity

    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
    }

    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Output final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);

    // ------------ GPU ----------- //

    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;

    float *pos_g_x, *pos_g_y, *pos_g_z, *vel_g_x, *vel_g_y, *vel_g_z, *pos_l_x, *pos_l_y, *pos_l_z, *vel_l_x, *vel_l_y, *vel_l_z;

    pos_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
    pos_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
    pos_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU

    cudaMalloc((void**)&pos_l_x, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_x, sizeof(float)*np);      // allocate memory for velocities on the GPU
    cudaMalloc((void**)&pos_l_y, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_y, sizeof(float)*np);      // allocate memory for velocities on the GPU
    cudaMalloc((void**)&pos_l_z, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_z, sizeof(float)*np);      // allocate memory for velocities on the GPU

    for (int n = 0; n < np; n++){
        pos_g_x[n] = 0; pos_g_y[n] = 0; pos_g_z[n] = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g_x[n] = 0; vel_g_y[n] = 0; vel_g_z[n] = 0; // zero out velocity for GPU variables (before copying to GPU)
    }

    cudaMemcpy(pos_l_x, pos_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_x, vel_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    cudaMemcpy(pos_l_y, pos_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_y, vel_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    cudaMemcpy(pos_l_z, pos_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_z, vel_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory

    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(vel_l_x, vel_l_y, vel_l_z, pos_l_x, pos_l_y, pos_l_z, np, ts, dt);             // Launch GPU kernel

    //stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");


    // Output GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);

    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);

}

$ nvcc -O3 -o t895 t895.cu
$ ./t895
Starting CPU kernel
CPU kernel finished
CPUtime =  923.6 ms
Starting GPU kernel
GPU kernel finished
GPUtime =   12.3 ms
CASE=0, Speedup = 74.95
$
