在内联ptx中加载函数参数 [英] load function parameters in inlined ptx

查看:192
本文介绍了在内联ptx中加载函数参数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有以下函数与内联汇编在32位Visual Studio 2008调试模式下工作正常:

  __ device__ void add(int * pa,int * pb)
{
asm(.reg .u32 s 3;::);
asm(。reg .u32 r 14&;::);

asm(ld.global.b32 s0,[%0];::r(& pa)); //加载pa的地址,pb
printf(...);
asm(ld.global.b32 s1,[%0];::r(& pb));
printf(...);
asm(ld.global.b32 r1,[s0 + 8];::);
printf(...);
asm(ld.global.b32 r2,[s1 + 8];::);
printf(...);

... //执行一些操作
}

pa和pb在设备上全局分配,例如

  __ device__ int pa [3] = {0,0x927c0000,0x20000011}; 
__device__ int pb [3] = {0,0xbb900000,0x2000000b};然而,这个代码在发布模式下失败,在线 asm(ld .global.b32 r1,[s0 + 8];::);
如何在发布模式下使用inline ptx正确加载函数参数?



PS使用-G标志(生成GPU调试信息)构建发布模式会导致代码在发布模式下正确运行。
谢谢你,

解决方案

希望这段代码会有所帮助。我仍然在猜测你正在尝试做什么,但我开始与你的代码,并决定添加一些值在 pa pb 和 pb [0] / p>

此代码是为64位机器编写的,但将其转换为32位指针不应该很困难。我已经标记需要更改为32位指针与注释的行。希望这将回答你关于如何使用作为设备内存指针的函数参数的问题:

  #include< stdio.h> ; 

__device__ int pa [3] = {0,0x927c0000,0x20000011};
__device__ int pb [3] = {0,0xbb900000,0x2000000b};

__device__ void add(int * mpa,int * mpb)
{
asm(。reg .u64 s 2;::); // change to .u32 for 32 bit pointers
asm(。reg .u32 r< 6> ;;::);

asm(mov.u64 s0,%0;::l(mpa)); //改为.u32,32位为r
asm(mov.u64 s1,%0;::l(mpb)); //改为.u32,32位为r
asm(ld.global.u32 r0,[s0 + 4];::);
asm(ld.global.u32 r1,[s1 + 4];::);
asm(ld.global.u32 r2,[s0 + 8];::);
asm(ld.global.u32 r3,[s1 + 8];::);
asm(add.u32 r4,r0,r2;::);
asm(add.u32 r5,r1,r3;::);
asm(st.global.u32 [s0],r4;::);
asm(st.global.u32 [s1],r5;::);
}

__global__ void mykernel(){
printf(pa [0] =%x,pb [0] =%x \\\
,pa [0] ,pb [0]);
add(pa,pb);
printf(pa [0] =%x,pb [0] =%x \\\
,pa [0],pb [0]);
}

int main(){
mykernel<<< 1,1>>>();
cudaDeviceSynchronize();
return 0;
}

当我运行这个代码时,我得到:

  $ ./t128 
pa [0] = 0,pb [0] = 0
pa [0] = b27c0011,pb [ 0] = db90000b
$

我相信这是正确的输出。



我编译它:

  nvcc -O3 -arch = sm_20 -o t128 t128 .cu 


I have the following function with inline assembly that works fine on debug mode in 32 bit Visual Studio 2008:

__device__ void add(int* pa, int* pb)
{
  asm(".reg .u32   s<3>;"::);
  asm(".reg .u32   r<14>;"::);

  asm("ld.global.b32    s0, [%0];"::"r"(&pa));      //load addresses of pa, pb
  printf(...);
  asm("ld.global.b32    s1, [%0];"::"r"(&pb));
  printf(...);
  asm("ld.global.b32    r1, [s0+8];"::);
  printf(...);  
  asm("ld.global.b32    r2, [s1+8];"::);
  printf(...);

  ...// perform some operations
}

pa and pb are globally allocated on the device such as

__device__ int pa[3] = {0, 0x927c0000, 0x20000011};  
__device__ int pb[3] = {0, 0xbb900000, 0x2000000b};

However, this code fails on release mode, on line asm("ld.global.b32 r1, [s0+8];"::); How can I load function parameters correctly with inline ptx on release mode?

P.S. building the release mode with -G flag (Generates GPU debug info) causes the code to run correctly on release mode. Thank you,

解决方案

Hopefully this code will help. I'm still guessing at what you are trying to do exactly, but I started with your code and decided to add some values in the pa and pb arrays and store them back into pa[0] and pb[0].

This code is written for a 64 bit machine but converting it to 32 bit pointers should not be difficult. I have marked the lines that need to be changed for 32 bit pointers with a comment. Hopefully this will answer your question about how to use function parameters that are pointers to device memory:

#include <stdio.h>

__device__ int pa[3] = {0, 0x927c0000, 0x20000011};
__device__ int pb[3] = {0, 0xbb900000, 0x2000000b};

__device__ void add(int* mpa, int* mpb)
{
  asm(".reg .u64   s<2>;"::);  // change to .u32 for 32 bit pointers
  asm(".reg .u32   r<6>;"::);

  asm("mov.u64    s0, %0;"::"l"(mpa));      //change to .u32 and "r" for 32 bit
  asm("mov.u64    s1, %0;"::"l"(mpb));      //change to .u32 and "r" for 32 bit
  asm("ld.global.u32    r0, [s0+4];"::);
  asm("ld.global.u32    r1, [s1+4];"::);
  asm("ld.global.u32    r2, [s0+8];"::);
  asm("ld.global.u32    r3, [s1+8];"::);
  asm("add.u32    r4, r0, r2;"::);
  asm("add.u32    r5, r1, r3;"::);
  asm("st.global.u32    [s0], r4;"::);
  asm("st.global.u32   [s1], r5;"::);
}

__global__ void mykernel(){
  printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
  add(pa, pb);
  printf("pa[0] = %x, pb[0] = %x\n", pa[0], pb[0]);
}

int  main() {
  mykernel<<<1,1>>>();
  cudaDeviceSynchronize();
  return 0;
}

When I run this code I get:

$ ./t128
pa[0] = 0, pb[0] = 0
pa[0] = b27c0011, pb[0] = db90000b
$

which I believe is correct output.

I compiled it with:

nvcc -O3 -arch=sm_20 -o t128 t128.cu

这篇关于在内联ptx中加载函数参数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆