来自2D阵列CUDA的2D纹理 [英] 2D Texture from 2D array CUDA

查看:106
本文介绍了来自2D阵列CUDA的2D纹理的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试将一个 Nx3 数组传递给内核,通过纹理内存读取它,并把结果写入第二个数组。下面是我简化后的代码,其中 N = 8:

  #include< cstdio> 
#include handle.h
使用命名空间std;

texture< float,2> tex_w;

// Copies one texel of the 2D texture reference tex_w into f[i][j] for each
// thread, with i taken from threadIdx.x and j from threadIdx.y.
// The parameter w is accepted but never read in the body.
// NOTE(review): tex2D takes the x coordinate first and the y coordinate
// second, so i is used here as the texture x index -- this listing is the
// one whose unexpected output the question describes.
__global__ void kernel(int imax,float(* w)[3],float(* f)[3])
{
int i = threadIdx.x;
int j = threadIdx.y;

// Only i is bounds-checked; j is used as-is.
if(i <imax)
f [i] [j] = tex2D(tex_w,i,j);
}

// Prints the first imax rows of the 3-column host array w to stdout,
// one row per line, prefixed by the row index.
//   imax : number of rows to print
//   w    : pointer to rows of 3 floats
void print_to_stdio(int imax,float(* w)[3])
{
    for(int i = 0; i < imax; i++)
    {
        printf("%2d  %3.6f\t  %3.6f\t %3.6f\n", i, w[i][0], w[i][1], w[i][2]);
    }
}

// Host driver of the question's listing: fills an imax x 3 host array,
// copies it to linear device memory, binds that memory to the 2D texture
// reference tex_w, launches kernel and prints what came back.
// Only the syntax has been restored here; the bind call is left exactly as
// asked about, because it is what produces the unexpected output shown below.
int main(void)
{
    int imax = 8;
    float (*w)[3];
    float (*d_w)[3], (*d_f)[3];
    dim3 grid(imax,3);   // passed as the block dimensions in the launch below

    w = (float (*)[3])malloc(imax*3*sizeof(float));

    // w[i][j] = i + 0.01*j, so each value encodes its own row and column
    for(int i = 0; i < imax; i++)
    {
        for(int j = 0; j < 3; j++)
        {
            w[i][j] = i + 0.01f*j;
        }
    }

    // Plain linear allocations of 3*imax floats (return codes not checked)
    cudaMalloc( (void**) &d_w, 3*imax*sizeof(float) );
    cudaMalloc( (void**) &d_f, 3*imax*sizeof(float) );

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    // NOTE(review): width = imax, height = 3 and pitch = imax*sizeof(float)
    // are passed here, while the host array is laid out as imax rows of
    // 3 floats.
    HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );

    cudaMemcpy(d_w, w, 3*imax*sizeof(float), cudaMemcpyHostToDevice);

    // just use threads for simplicity
    kernel<<<1,grid>>>(imax, d_w, d_f);

    // The result overwrites the host input array w
    cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost);

    cudaUnbindTexture(tex_w);
    cudaFree(d_w);
    cudaFree(d_f);

    print_to_stdio(imax, w);

    free(w);
    return 0;
}

运行此代码,我希望得到:

  0 0.000000 0.010000 0.020000 
1 1.000000 1.010000 1.020000
2 2.000000 2.010000 2.020000
3 3.000000 3.010000 3.020000
4 4.000000 4.010000 4.020000
5 5.000000 5.010000 5.020000
6 6.000000 6.010000 6.020000
7 7.000000 7.010000 7.020000

但我却得到:

  0 0.000000 2.020000 5.010000 
1 0.010000 3.000000 5.020000
2 0.020000 3.010000 6.000000
3 1.000000 3.020000 6.010000
4 1.010000 4.000000 6.020000
5 1.020000 4.010000 7.000000
6 2.000000 4.020000 7.010000
7 2.010000 5.000000 7.020000

我认为这与我传给 cudaBindTexture2D 的 pitch 参数有关,但使用更小的值会导致"无效参数"(invalid argument)错误。

预先感谢!

解决方案

在 brano 做出回答、并且我进一步了解了 pitch(每行所占的字节跨度)的工作原理之后,我来回答自己的问题。这是修改后的代码:

  #include< cstdio> 
#include< iostream>
#include handle.cu

使用命名空间std;

texture< float,2,cudaReadModeElementType> tex_w;

// Reads the 2D texture reference tex_w and stores each texel in f[i][j].
// Expected launch: a single block with imax threads in x and 3 threads in y.
//   imax : number of rows in f (and height of the bound texture)
//   f    : device output array of rows of 3 floats
__global__ void kernel(int imax,float(* f)[3])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    // width = 3, height = imax
    // but we have imax threads in x, 3 in y
    // therefore height corresponds to x threads (i)
    // and width corresponds to y threads (j)
    // j is bounded as well so a wider block cannot write past a 3-float row
    if(i < imax && j < 3)
    {
        // linear filtering looks between indices, so sample at the
        // texel centre (index + 0.5)
        f[i][j] = tex2D(tex_w, j + 0.5f, i + 0.5f);
    }
}

// Prints the first imax rows of the 3-column host array w to stdout,
// one row per line prefixed by the row index, followed by a blank line.
//   imax : number of rows to print
//   w    : pointer to rows of 3 floats
void print_to_stdio(int imax,float(* w)[3])
{
    for(int i = 0; i < imax; i++)
    {
        printf("%2d  %3.3f  %3.3f  %3.3f\n", i, w[i][0], w[i][1], w[i][2]);
    }
    printf("\n");
}

// Host driver: fills an imax x 3 host array, uploads it into pitched device
// memory, binds that memory to the 2D texture reference tex_w, runs kernel to
// read it back through the texture into d_f, and prints the array before and
// after the round trip. Returns 0 on success, 1 if the host allocation fails.
int main(void)
{
    int imax = 8;
    float (*w)[3];
    float (*d_f)[3], *d_w;
    dim3 grid(imax,3);   // passed as the block dimensions in the launch below

    w = (float (*)[3])malloc(imax*3*sizeof(float));
    if (w == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // w[i][j] = i + 0.01*j, so each value encodes its own row and column
    for(int i = 0; i < imax; i++)
    {
        for(int j = 0; j < 3; j++)
        {
            w[i][j] = i + 0.01f*j;
        }
    }

    print_to_stdio(imax, w);

    size_t pitch;
    HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, 3*sizeof(float), imax) );

    HANDLE_ERROR( cudaMemcpy2D(d_w,             // device destination
                               pitch,           // device pitch (calculated above)
                               w,               // src on host
                               3*sizeof(float), // pitch on src (no padding so just width of row)
                               3*sizeof(float), // width of data in bytes
                               imax,            // height of data
                               cudaMemcpyHostToDevice) );

    // Configure the texture reference BEFORE binding so the settings are in
    // place when the texture is bound.
    tex_w.normalized = false;  // don't use normalized values
    tex_w.filterMode = cudaFilterModeLinear;
    tex_w.addressMode[0] = cudaAddressModeClamp; // don't wrap around indices
    tex_w.addressMode[1] = cudaAddressModeClamp;

    HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

    // d_f will have result array
    HANDLE_ERROR( cudaMalloc((void**)&d_f, 3*imax*sizeof(float)) );

    // just use threads for simplicity
    kernel<<<1,grid>>>(imax, d_f);
    HANDLE_ERROR( cudaGetLastError() );   // a launch reports errors only this way

    // Blocking copy; the result overwrites the host input array w
    HANDLE_ERROR( cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost) );

    HANDLE_ERROR( cudaUnbindTexture(tex_w) );
    HANDLE_ERROR( cudaFree(d_w) );
    HANDLE_ERROR( cudaFree(d_f) );

    print_to_stdio(imax, w);

    free(w);
    return 0;
}

这样就不必使用 cudaMemcpy() 并在主机端手动处理 pitch:cudaMemcpy2D() 同时接受设备数据和主机数据各自的 pitch 参数。由于主机上的数据是普通的连续分配(没有填充),按我的理解,主机端的 pitch 就是行宽,即 3 * sizeof(float)。


I am trying to pass an Nx3 array to a kernel and read from it as in texture memory and write to a second array. Here is my simplified code with N=8:

#include <cstdio>
#include "handle.h"
using namespace std;

texture<float,2> tex_w;

// Copies one texel of the 2D texture reference tex_w into f[i][j] for each
// thread, with i taken from threadIdx.x and j from threadIdx.y.
// The parameter w is accepted but never read in the body.
// NOTE(review): tex2D takes the x coordinate first and the y coordinate
// second, so i is used here as the texture x index -- this listing is the
// one whose unexpected output the question describes.
__global__ void kernel(int imax, float(*w)[3], float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;

  // Only i is bounds-checked; j is used as-is.
  if(i<imax)
      f[i][j] = tex2D(tex_w, i, j);
}

// Writes the first imax rows of the 3-column host array w to stdout,
// one line per row, each line starting with the row index.
void print_to_stdio(int imax, float (*w)[3])
{
  int row = 0;
  while (row < imax)
    {
      const float *vals = w[row];
      printf("%2d  %3.6f\t  %3.6f\t %3.6f\n", row, vals[0], vals[1], vals[2]);
      ++row;
    }
}

// Host driver of the question's listing: fills an imax x 3 host array,
// copies it to linear device memory, binds that memory to the 2D texture
// reference tex_w, launches kernel and prints what came back.
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_w)[3], (*d_f)[3];
  dim3 grid(imax,3);   // passed as the block dimensions in the launch below

  w = (float (*)[3])malloc(imax*3*sizeof(float));

  // w[i][j] = i + 0.01*j, so each value encodes its own row and column
  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }

  // Plain linear allocations of 3*imax floats (return codes not checked)
  cudaMalloc( (void**) &d_w, 3*imax*sizeof(float) );
  cudaMalloc( (void**) &d_f, 3*imax*sizeof(float) );

  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  // NOTE(review): width = imax, height = 3 and pitch = imax*sizeof(float)
  // are passed here, while the host array is laid out as imax rows of
  // 3 floats -- this is the call the question asks about.
  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );

  cudaMemcpy(d_w, w, 3*imax*sizeof(float), cudaMemcpyHostToDevice);

  // just use threads for simplicity
  kernel<<<1,grid>>>(imax, d_w, d_f);

  // The result overwrites the host input array w
  cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost);

  cudaUnbindTexture(tex_w);
  cudaFree(d_w);
  cudaFree(d_f);

  print_to_stdio(imax, w);

  free(w);
  return 0;
}

Running this code I would expect to get:

0  0.000000   0.010000   0.020000
1  1.000000   1.010000   1.020000
2  2.000000   2.010000   2.020000
3  3.000000   3.010000   3.020000
4  4.000000   4.010000   4.020000
5  5.000000   5.010000   5.020000
6  6.000000   6.010000   6.020000
7  7.000000   7.010000   7.020000

but instead i get:

0  0.000000   2.020000   5.010000
1  0.010000   3.000000   5.020000
2  0.020000   3.010000   6.000000
3  1.000000   3.020000   6.010000
4  1.010000   4.000000   6.020000
5  1.020000   4.010000   7.000000
6  2.000000   4.020000   7.010000
7  2.010000   5.000000   7.020000

I think this has something to do with the pitch parameter I give to cudaBindTexture2D but using smaller values gives an invalid argument error.

Thanks in advance!

解决方案

After brano's response and looking more into how pitch works, I'll answer my own question. Here is the modified code:

#include <cstdio>
#include <iostream>
#include "handle.cu"

using namespace std;

texture<float,2,cudaReadModeElementType> tex_w;

// Reads the 2D texture reference tex_w and stores each texel in f[i][j].
// Expected launch: a single block with imax threads in x and 3 threads in y.
//   imax : number of rows in f (and height of the bound texture)
//   f    : device output array of rows of 3 floats
__global__ void kernel(int imax, float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;
  // width = 3, height = imax
  // but we have imax threads in x, 3 in y
  // therefore height corresponds to x threads (i)
  // and width corresponds to y threads (j)
  // j is bounded as well so a wider block cannot write past a 3-float row
  if(i<imax && j<3)
    {
      // linear filtering looks between indices, so sample at the
      // texel centre (index + 0.5)
      f[i][j] = tex2D(tex_w, j+0.5f, i+0.5f);
    }
}

// Writes the first imax rows of the 3-column host array w to stdout,
// one line per row starting with the row index, then a blank line.
void print_to_stdio(int imax, float (*w)[3])
{
  int row = 0;
  while (row < imax)
    {
      const float *vals = w[row];
      printf("%2d  %3.3f  %3.3f  %3.3f\n", row, vals[0], vals[1], vals[2]);
      ++row;
    }
  printf("\n");
}

// Host driver: fills an imax x 3 host array, uploads it into pitched device
// memory, binds that memory to the 2D texture reference tex_w, runs kernel to
// read it back through the texture into d_f, and prints the array before and
// after the round trip. Returns 0 on success, 1 if the host allocation fails.
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_f)[3], *d_w;
  dim3 grid(imax,3);   // passed as the block dimensions in the launch below

  w = (float (*)[3])malloc(imax*3*sizeof(float));
  if (w == NULL)
    {
      fprintf(stderr, "host allocation failed\n");
      return 1;
    }

  // w[i][j] = i + 0.01*j, so each value encodes its own row and column
  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }

  print_to_stdio(imax, w);

  size_t pitch;
  HANDLE_ERROR( cudaMallocPitch((void**)&d_w, &pitch, 3*sizeof(float), imax) );

  HANDLE_ERROR( cudaMemcpy2D(d_w,             // device destination
                             pitch,           // device pitch (calculated above)
                             w,               // src on host
                             3*sizeof(float), // pitch on src (no padding so just width of row)
                             3*sizeof(float), // width of data in bytes
                             imax,            // height of data
                             cudaMemcpyHostToDevice) );

  // Configure the texture reference BEFORE binding so the settings are in
  // place when the texture is bound.
  tex_w.normalized = false;  // don't use normalized values
  tex_w.filterMode = cudaFilterModeLinear;
  tex_w.addressMode[0] = cudaAddressModeClamp; // don't wrap around indices
  tex_w.addressMode[1] = cudaAddressModeClamp;

  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, tex_w.channelDesc, 3, imax, pitch) );

  // d_f will have result array
  HANDLE_ERROR( cudaMalloc((void**)&d_f, 3*imax*sizeof(float)) );

  // just use threads for simplicity
  kernel<<<1,grid>>>(imax, d_f);
  HANDLE_ERROR( cudaGetLastError() );   // a launch reports errors only this way

  // Blocking copy; the result overwrites the host input array w
  HANDLE_ERROR( cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost) );

  HANDLE_ERROR( cudaUnbindTexture(tex_w) );
  HANDLE_ERROR( cudaFree(d_w) );
  HANDLE_ERROR( cudaFree(d_f) );

  print_to_stdio(imax, w);

  free(w);
  return 0;
}

Instead of using cudaMemcpy() and having to deal with pitch on the host machine, I use cudaMemcpy2D(), which accepts a pitch argument for both the device data and the host data. Since the data on the host is a plain contiguous allocation with no padding, my understanding is that its pitch is simply the row width, i.e. 3*sizeof(float).

这篇关于来自2D阵列CUDA的2D纹理的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆