如何正确使用投在CUDA的uint4载体,以提高内存吞吐量全局存储器阵列? [英] How to properly cast a global memory array using the uint4 vector in CUDA to increase memory throughput?

查看:502
本文介绍了如何正确使用投在CUDA的uint4载体,以提高内存吞吐量全局存储器阵列?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

通常有两种方法来提高存储吞吐量计算能力的GPU 1.3 CUDA一个内核全局内存的;存储器存取聚结和至少4个字节的访问的话。与第一技术由相同半warp的线程访问同一存储器段被合并到更少的交易数据,而至少是4个字节此存储器段被有效地从32字节增加至128的访问的话。

更新:根据talonmies解决方案回答。访问16字节,而不是当有存储在全局存储器无符号字​​符1字节即,uint4矢量通常由铸造存储器阵列uint4使用。来从uint4向量中的值,它可以被recasted到如下uchar4:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

__global__ void kernel ( unsigned char *d_text, unsigned char *d_out ) {

    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    extern __shared__ unsigned char s_array[];

    uint4 *uint4_text = reinterpret_cast<uint4 *>(d_text);
    uint4 uint4_var;

    //memory transaction
    uint4_var = uint4_text[0];

    //recast data to uchar4
    uchar4 c0 = *reinterpret_cast<uchar4 *>(&uint4_var.x);
    uchar4 c4 = *reinterpret_cast<uchar4 *>(&uint4_var.y);
    uchar4 c8 = *reinterpret_cast<uchar4 *>(&uint4_var.z);
    uchar4 c12 = *reinterpret_cast<uchar4 *>(&uint4_var.w);

    d_out[idx] = c0.y;
}

int main ( void ) {

    unsigned char *d_text, *d_out;

    unsigned char *h_out = ( unsigned char * ) malloc ( 16 * sizeof ( unsigned char ) );
    unsigned char *h_text = ( unsigned char * ) malloc ( 16 * sizeof ( unsigned char ) );

    int i;

    for ( i = 0; i < 16; i++ )
            h_text[i] = 65 + i;

    cudaMalloc ( ( void** ) &d_text, 16 * sizeof ( unsigned char ) );
    cudaMalloc ( ( void** ) &d_out, 16 * sizeof ( unsigned char ) );

    cudaMemcpy ( d_text, h_text, 16 * sizeof ( unsigned char ), cudaMemcpyHostToDevice );

    kernel<<<1,16>>>(d_text, d_out );

    cudaMemcpy ( h_out, d_out, 16 * sizeof ( unsigned char ), cudaMemcpyDeviceToHost );

    for ( i = 0; i < 16; i++ )
            printf("%c\n", h_out[i]);

    return 0;
}


解决方案

如果我明白你正在尝试做的,逻辑的方法是使用C ++ reinter pret_cast 机制,以使编译器生成正确的向量加载指令,然后使用CUDA内置的字节大小的矢量类型 uchar4 内四个32访问每个字节从全局内存加载位字。使用这种方法,你真的把信任编译器知道的最佳方式做到每个32位寄存器中的字节明智的访问。

一个完全人为的例子可能是这样的:

#include <cstdio>
#include <cstdlib>

__global__
void kernel(unsigned int *in, unsigned char* out)
{
    int tid = threadIdx.x;

    uint4* p = reinterpret_cast<uint4*>(in);
    uint4  i4 = p[tid]; // vector load here

    uchar4 c0 = *reinterpret_cast<uchar4 *>(&i4.x);
    uchar4 c4 = *reinterpret_cast<uchar4 *>(&i4.y);
    uchar4 c8 = *reinterpret_cast<uchar4 *>(&i4.z);
    uchar4 c12 = *reinterpret_cast<uchar4 *>(&i4.w);

    out[tid*4+0] = c0.x;
    out[tid*4+1] = c4.y;
    out[tid*4+2] = c8.z;
    out[tid*4+3] = c12.w;
}

int main(void)
{
    unsigned int c[8] = { 
        2021161062, 2021158776, 2020964472, 1920497784, 
        2021161058, 2021161336, 2020898936, 1702393976 };

    unsigned int * _c;
    cudaMalloc((void **)&_c, sizeof(int)*size_t(8));
    cudaMemcpy(_c, c, sizeof(int)*size_t(8), cudaMemcpyHostToDevice);
    unsigned char * _m;
    cudaMalloc((void **)&_m, sizeof(unsigned char)*size_t(8));

    kernel<<<1,2>>>(_c, _m);

    unsigned char m[8];
    cudaMemcpy(m, _m, sizeof(unsigned char)*size_t(8), cudaMemcpyDeviceToHost);

    for(int i=0; i<8; i++)
        fprintf(stdout, "%d %c\n", i, m[i]);

    return 0;
}

应产生嵌入供给到内核无符号整数的阵列中字符的可读的字符串。

一个需要注意的是,用于计算1.x的目标open64编译器经常会打败试图生成矢量加载此策略,如果它可以检测到,并非所有的向量中的话被实际使用。因此,请确保你接触的所有输入字的输入向量类型,以确保编译器播放好听。

There are generally two techniques for increasing the global memory throughput of a CUDA kernel on compute capability 1.3 GPUs: coalescing memory accesses, and accessing words of at least 4 bytes. With the first technique, accesses to the same memory segment by threads of the same half-warp are coalesced into fewer transactions; with the second, by accessing words of at least 4 bytes, the effective size of the memory segment serviced per transaction is increased from 32 bytes to 128.

Update: solution based on talonmies' answer. To access 16-byte words instead of 1-byte words when unsigned chars are stored in global memory, the memory array is commonly cast to uint4. To get the individual values back out of the uint4 vector, it can be recast to uchar4 as below:

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

// Questioner's demo kernel: performs one 16-byte (uint4) vector load from
// global memory, then reinterprets the loaded 32-bit words as bytes.
//
// NOTE(review): every thread loads uint4_text[0] rather than an idx-based
// element, so all 16 launched threads read the same 16 bytes and write the
// same byte (c0.y) to their own d_out slot. s_array and c4/c8/c12 are
// declared but never used. The uint4 load requires d_text to be 16-byte
// aligned — cudaMalloc allocations satisfy this.
__global__ void kernel ( unsigned char *d_text, unsigned char *d_out ) {

    // Flat global thread index; one output byte per thread.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Dynamic shared memory declaration; unused in this example (and the
    // launch below passes no dynamic shared-memory size).
    extern __shared__ unsigned char s_array[];

    // Reinterpret the byte array as an array of 16-byte uint4 elements so a
    // single load instruction fetches 16 bytes.
    uint4 *uint4_text = reinterpret_cast<uint4 *>(d_text);
    uint4 uint4_var;

    //memory transaction
    uint4_var = uint4_text[0];

    //recast data to uchar4
    uchar4 c0 = *reinterpret_cast<uchar4 *>(&uint4_var.x);
    uchar4 c4 = *reinterpret_cast<uchar4 *>(&uint4_var.y);
    uchar4 c8 = *reinterpret_cast<uchar4 *>(&uint4_var.z);
    uchar4 c12 = *reinterpret_cast<uchar4 *>(&uint4_var.w);

    // Second byte of the first 32-bit word ('B' for the host's "ABCDEF..."
    // sample input).
    d_out[idx] = c0.y;
}

#ifndef CUDA_CHECK
// Abort with a readable message when a CUDA runtime call fails; kernel
// launches are covered by checking cudaGetLastError() after the launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if ( err_ != cudaSuccess ) {                                        \
            fprintf ( stderr, "CUDA error: %s\n",                           \
                      cudaGetErrorString ( err_ ) );                        \
            exit ( EXIT_FAILURE );                                          \
        }                                                                   \
    } while ( 0 )
#endif

// Host driver: fills 16 bytes with 'A'..'P', runs the demo kernel with one
// block of 16 threads, and prints the 16 result bytes one per line.
int main ( void ) {

    unsigned char *d_text, *d_out;

    unsigned char *h_out = ( unsigned char * ) malloc ( 16 * sizeof ( unsigned char ) );
    unsigned char *h_text = ( unsigned char * ) malloc ( 16 * sizeof ( unsigned char ) );
    if ( h_out == NULL || h_text == NULL ) {
        fprintf ( stderr, "host allocation failed\n" );
        return EXIT_FAILURE;
    }

    int i;

    // 65 == 'A', so the input is the string "ABCDEFGHIJKLMNOP".
    for ( i = 0; i < 16; i++ )
            h_text[i] = 65 + i;

    CUDA_CHECK ( cudaMalloc ( ( void** ) &d_text, 16 * sizeof ( unsigned char ) ) );
    CUDA_CHECK ( cudaMalloc ( ( void** ) &d_out, 16 * sizeof ( unsigned char ) ) );

    CUDA_CHECK ( cudaMemcpy ( d_text, h_text, 16 * sizeof ( unsigned char ), cudaMemcpyHostToDevice ) );

    kernel<<<1,16>>>(d_text, d_out );
    // Launch-configuration errors only surface via cudaGetLastError().
    CUDA_CHECK ( cudaGetLastError () );

    // Blocking copy; also synchronizes with the preceding kernel launch.
    CUDA_CHECK ( cudaMemcpy ( h_out, d_out, 16 * sizeof ( unsigned char ), cudaMemcpyDeviceToHost ) );

    for ( i = 0; i < 16; i++ )
            printf("%c\n", h_out[i]);

    // Release device and host buffers (previously leaked).
    CUDA_CHECK ( cudaFree ( d_text ) );
    CUDA_CHECK ( cudaFree ( d_out ) );
    free ( h_out );
    free ( h_text );

    return 0;
}

解决方案

If I have understood what you are trying to do, the logical approach is to use the C++ reinterpret_cast mechanism to make the compiler generate the correct vector load instruction, then use the CUDA built in byte sized vector type uchar4 to access each byte within each of the four 32 bit words loaded from global memory. Using this approach, you are really putting trust in the compiler knowing the optimal way to do byte wise access within each 32 bit register.

A completely contrived example could look like this:

#include <cstdio>
#include <cstdlib>

// Loads one uint4 (16 bytes, four 32-bit words) per thread with a single
// vector load, views each word as four bytes via uchar4, and emits four
// output bytes per thread. All four loaded words are touched deliberately
// so the compiler does not demote the vector load to scalar loads.
// Expects a 1D launch where thread t owns in[4*t..4*t+3] and out[4*t..4*t+3];
// `in` must be 16-byte aligned for the uint4 load.
__global__
void kernel(unsigned int *in, unsigned char* out)
{
    const int tid = threadIdx.x;

    // Single 16-byte vector load for this thread's four words.
    const uint4 words = reinterpret_cast<uint4*>(in)[tid];

    // Byte-wise views of each 32-bit word.
    const uchar4 b0 = *reinterpret_cast<const uchar4 *>(&words.x);
    const uchar4 b4 = *reinterpret_cast<const uchar4 *>(&words.y);
    const uchar4 b8 = *reinterpret_cast<const uchar4 *>(&words.z);
    const uchar4 b12 = *reinterpret_cast<const uchar4 *>(&words.w);

    // One byte from each word: lane 0 of word 0, lane 1 of word 1, etc.
    unsigned char *dst = out + tid * 4;
    dst[0] = b0.x;
    dst[1] = b4.y;
    dst[2] = b8.z;
    dst[3] = b12.w;
}

#ifndef CUDA_CHECK
// Abort with a readable message when a CUDA runtime call fails; kernel
// launches are covered by checking cudaGetLastError() after the launch.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error: %s\n",                         \
                    cudaGetErrorString(err_));                          \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)
#endif

// Host driver for the contrived example: eight 32-bit words holding packed
// ASCII bytes are unpacked by two threads (one uint4 vector load each, so
// the two loads cover all eight words), producing eight printable bytes.
int main(void)
{
    unsigned int c[8] = { 
        2021161062, 2021158776, 2020964472, 1920497784, 
        2021161058, 2021161336, 2020898936, 1702393976 };

    unsigned int * _c;
    CUDA_CHECK(cudaMalloc((void **)&_c, sizeof(unsigned int)*size_t(8)));
    CUDA_CHECK(cudaMemcpy(_c, c, sizeof(unsigned int)*size_t(8), cudaMemcpyHostToDevice));
    unsigned char * _m;
    CUDA_CHECK(cudaMalloc((void **)&_m, sizeof(unsigned char)*size_t(8)));

    kernel<<<1,2>>>(_c, _m);
    // Launch-configuration errors only surface via cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    unsigned char m[8];
    // Blocking copy; also synchronizes with the preceding kernel launch.
    CUDA_CHECK(cudaMemcpy(m, _m, sizeof(unsigned char)*size_t(8), cudaMemcpyDeviceToHost));

    for(int i=0; i<8; i++)
        fprintf(stdout, "%d %c\n", i, m[i]);

    // Release device buffers (previously leaked).
    CUDA_CHECK(cudaFree(_c));
    CUDA_CHECK(cudaFree(_m));

    return 0;
}

which should produce a readable string of characters embedded in the array of unsigned integers supplied to the kernel.

One caveat is that the open64 compiler used for compute 1.x targets would often defeat this strategy of trying to generate vector loads if it could detect that not all of the words in the vector were actually used. So make sure that you touch all of the input words in the input vector type to ensure the compiler plays nicely.

这篇关于如何正确使用投在CUDA的uint4载体,以提高内存吞吐量全局存储器阵列?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆