Cuda Parallelize Kernel

Problem Description


I'm trying to parallelize a simple update loop of a simulation on the GPU. Basically there are a bunch of "creatures" represented by circles that in each update loop will move and then there will be a check of whether any of them intersect. radii is the radius of the different types of creatures.

import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32, uint32)')
def update(p_x, p_y, radii, types, velocities, max_velocities, acceleration, num_creatures, cycles):
    for c in range(cycles):
        for i in range(num_creatures):
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])
        for i in range(num_creatures):
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = 16
update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, device_velocities, device_max_velocities,
        acceleration, num_creatures, cycles)
print(device_p_x.copy_to_host())

The 1.0 in math.cos and math.sin is just a placeholder for the directions of the individual creatures.

As it is now there are multiple threads, but they execute the same code. What changes do I have to make to the kernel to parallelize it?

Solution

The most obvious dimension for parallelization to me seems to be the loop over i in your kernel, that is, the loop iterating over num_creatures. So I'll describe how to do that.

  1. Our goal will be to remove the loop on num_creatures, and instead let each iteration of the loop be handled by a separate CUDA thread. This is possible because the work done in each loop iteration is (mostly) independent - it does not depend on the results of other loop iterations (but see 2 below).

  2. A challenge we will run into is that the 2nd i for-loop in num_creatures presumably depends on the results of the first. If we leave everything as serial code running in a single thread, then that dependency is taken care of by the nature of serial code execution. However we want to parallelize this. Therefore we need a global sync in between the first for loop in num_creatures and the 2nd. A simple, convenient global sync in CUDA is the kernel launch, so we'll break the kernel code into two kernel functions. We'll call these update1 and update2

  3. This then presents the challenge about what to do about the over-arching loop in cycles. We can't simply replicate that loop in both kernels, because that would change the functional behavior - we would compute cycles updates to p_x before computing a single calculation of delta_x, for example. That is presumably not what is wanted. So, for simplicity, we'll hoist this loop out of the kernel code and back into host code. The host code will then call update1 and update2 kernels for cycles iterations.

  4. We also want to make the kernel processing adaptable to different sizes of num_creatures. So we'll pick a hard-coded size for threadsperblock but we will make the number of blocks launched variable, based on the size of num_creatures. To facilitate this, we need a thread-check (initial if-statement) in each of our kernels, so that "extra" threads don't do anything. A minimal sketch of this indexing pattern follows this list.
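
As a minimal sketch of the thread-check and grid-sizing idiom from points 1 and 4 (the kernel and variable names scale, arr, factor and n are illustrative placeholders, not part of the code in this question or answer), each thread computes its own global index with cuda.grid(1) and does nothing when that index falls beyond the problem size, while the number of blocks is derived from the problem size with a ceiling division:

import numpy as np
from numba import cuda

@cuda.jit
def scale(arr, factor, n):
    i = cuda.grid(1)               # global 1D thread index
    if i < n:                      # thread-check: "extra" threads do nothing
        arr[i] = arr[i] * factor

n = 1500
threadsperblock = 64
# Ceiling division, so every element gets a thread even when n is not a multiple of 64.
blockspergrid = (n + threadsperblock - 1) // threadsperblock

d_arr = cuda.to_device(np.arange(n, dtype=np.float32))
scale[blockspergrid, threadsperblock](d_arr, 2.0, n)
print(d_arr.copy_to_host()[:4])

(The code below sizes the grid as (num_creatures // threadsperblock) + 1 instead, which launches one extra block whenever num_creatures is an exact multiple of threadsperblock; with the thread-check in place, either formula is safe.)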

With that description, we end up with something like this:

$ cat t11.py
import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32, uint32)')
def update1(p_x, p_y, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y[i] = p_y[i] + (math.sin(1.0) * velocities[i])

@cuda.jit('void(float32[:], float32[:], float32[:], uint8[:], uint32)')
def update2(p_x, p_y, radii, types, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 1


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 800 // spacing):
    for y in range(1, 600 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    update1[blockspergrid, threadsperblock](device_p_x, device_p_y, device_velocities, device_max_velocities, acceleration, num_creatures)
    update2[blockspergrid, threadsperblock](device_p_x, device_p_y, device_radii, device_types, num_creatures)
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
$

which produces the same output as the original posted version (the original code launches 16 blocks of 64 threads that all do exactly the same work and step on each other as they write to the same data, so by "original posted version" I mean that code run with one block of one thread).

Note that there are certainly other ways to parallelize, and probably other optimizations possible, but this should give you a sensible starting point for CUDA work.

As mentioned to you in your previous question, the second kernel here really doesn't do anything useful, but I assume that is just a placeholder for future work. I'm also not sure you'll get what you want with your usage of radii here, but that's also not the focus of this question.

So what is the effect of all this, performance-wise? Again we will compare the original posted version (t12.py, below) running just one block of one thread (not 16 blocks of 64 threads, which would only be worse anyway) against this version, which happens to run 18 blocks of 64 threads (t11.py, below):

$ nvprof --print-gpu-trace python t11.py
==3551== NVPROF is profiling process 3551, command: python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
==3551== Profiling application: python t11.py
==3551== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
446.77ms  1.8240us                    -               -         -         -         -  7.8125KB  4.0847GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
446.97ms  1.7600us                    -               -         -         -         -  7.8125KB  4.2333GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.35ms  1.2160us                    -               -         -         -         -       12B  9.4113MB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.74ms  1.3440us                    -               -         -         -         -  1.9531KB  1.3859GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
447.93ms  1.5040us                    -               -         -         -         -  5.8594KB  3.7154GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
448.13ms  1.5360us                    -               -         -         -         -  5.8594KB  3.6380GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
448.57ms  5.4720us             (18 1 1)        (64 1 1)        36        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update1$241(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int) [49]
448.82ms  1.1200us             (18 1 1)        (64 1 1)         8        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update2$242(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, unsigned int) [50]
448.90ms  2.1120us                    -               -         -         -         -  7.8125KB  3.5277GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy

$ python t12.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
$ nvprof --print-gpu-trace python t12.py
==3604== NVPROF is profiling process 3604, command: python t12.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
==3604== Profiling application: python t12.py
==3604== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
296.22ms  1.8240us                    -               -         -         -         -  7.8125KB  4.0847GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
296.41ms  1.7920us                    -               -         -         -         -  7.8125KB  4.1577GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
296.79ms  1.2160us                    -               -         -         -         -       12B  9.4113MB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.21ms  1.3440us                    -               -         -         -         -  1.9531KB  1.3859GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.40ms  1.5040us                    -               -         -         -         -  5.8594KB  3.7154GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
297.60ms  1.5360us                    -               -         -         -         -  5.8594KB  3.6380GB/s    Pageable      Device  Quadro K2000 (0         1         7  [CUDA memcpy HtoD]
298.05ms  1.8453ms              (1 1 1)         (1 1 1)        36        0B        0B         -           -           -           -  Quadro K2000 (0         1         7  cudapy::__main__::update$241(Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<unsigned char, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, Array<float, int=1, A, mutable, aligned>, float, unsigned int, unsigned int) [38]
299.91ms  2.1120us                    -               -         -         -         -  7.8125KB  3.5277GB/s      Device    Pageable  Quadro K2000 (0         1         7  [CUDA memcpy DtoH]

Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$

We see that for the original t12.py version, the profiler reports a single update kernel running with 1 block and 1 thread, taking 1.8453 milliseconds. For the modified t11.py version posted in this answer, the profiler reports 18 blocks of 64 threads each for both the update1 and update2 kernels, with a combined execution time of approximately 5.47 + 1.12 = 6.59 microseconds, roughly a 280x reduction in kernel time (1845.3 / 6.59 ≈ 280).

EDIT: Based on some discussion in the comments, it should be possible to combine both kernels into a single kernel, using a double-buffering scheme on p_x and p_y:

$ cat t11.py
import numpy as np
import math
from numba import cuda


@cuda.jit('void(float32[:], float32[:], float32[:], float32[:], float32[:], uint8[:], float32[:], float32[:], float32, uint32)')
def update(p_x, p_y, p_x_new, p_y_new, radii, types, velocities, max_velocities, acceleration, num_creatures):
    i = cuda.grid(1)
    if i < num_creatures:
            velocities[i] = velocities[i] + acceleration
            if velocities[i] > max_velocities[i]:
                velocities[i] = max_velocities[i]
            p_x_new[i] = p_x[i] + (math.cos(1.0) * velocities[i])
            p_y_new[i] = p_y[i] + (math.sin(1.0) * velocities[i])
            for j in range(i, num_creatures):
                delta_x = p_x[j] - p_x[i]
                delta_y = p_y[j] - p_y[i]
                distance_squared = (delta_x * delta_x) + (delta_y * delta_y)
                sum_of_radii = radii[types[i]] + radii[types[i]]
                if distance_squared < sum_of_radii * sum_of_radii:
                    pass


acceleration = .1
creature_radius = 10
spacing = 20
food_radius = 3

max_num_creatures = 1500000
num_creatures = 0
max_num_food = 500
num_food = 0
max_num_entities = max_num_creatures + max_num_food
num_entities = 0
cycles = 2


p_x = np.zeros(max_num_entities, dtype=np.float32)
p_y = np.zeros(max_num_entities, dtype=np.float32)
radii = np.array([creature_radius, creature_radius, food_radius], dtype=np.float32)
types = np.zeros(max_num_entities, dtype=np.uint8)

velocities = np.zeros(max_num_creatures, dtype=np.float32)
max_velocities = np.zeros(max_num_creatures, dtype=np.float32)
# types:
# male - 0
# female - 1
# food - 2
for x in range(1, 80000 // spacing):
    for y in range(1, 6000 // spacing):
        if num_creatures % 2 == 0:
            types[num_creatures] = 0
        else:
            types[num_creatures] = 1
        p_x[num_creatures] = x * spacing
        p_y[num_creatures] = y * spacing
        max_velocities[num_creatures] = 5
        num_creatures += 1


device_p_x = cuda.to_device(p_x)
device_p_y = cuda.to_device(p_y)
device_p_x_new = cuda.to_device(p_x)
device_p_y_new = cuda.to_device(p_y)
device_radii = cuda.to_device(radii)
device_types = cuda.to_device(types)
device_velocities = cuda.to_device(velocities)
device_max_velocities = cuda.to_device(max_velocities)
threadsperblock = 64
blockspergrid = (num_creatures // threadsperblock) + 1
for i in range(cycles):
    if i % 2 == 0:
        update[blockspergrid, threadsperblock](device_p_x, device_p_y, device_p_x_new, device_p_y_new, device_radii, device_types,  device_velocities, device_max_velocities, acceleration, num_creatures)
    else:
        update[blockspergrid, threadsperblock](device_p_x_new, device_p_y_new, device_p_x, device_p_y, device_radii, device_types,  device_velocities, device_max_velocities, acceleration, num_creatures)

print(device_p_x_new.copy_to_host())
print(device_p_x.copy_to_host())
$ python t11.py
[ 20.05402946  20.05402946  20.05402946 ...,   0.           0.           0.        ]
[ 20.1620903  20.1620903  20.1620903 ...,   0.          0.          0.       ]
$

It is still necessary to preserve the kernel-calling loop in cycles in host code, since we still require the global sync provided by the kernel call. But for a given number of cycles, this will reduce the contribution of the kernel-call overhead.

Using this technique, care must be taken in the choice of cycles as well as the use of data from either the p_x or p_x_new buffer, for coherent results.
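
As a minimal sketch of that buffer choice (the helper latest_position_buffers is an illustrative name, not part of the code above): with the ping-pong loop above, even-numbered iterations (i = 0, 2, ...) write into the _new buffers and odd-numbered iterations write back into the originals, so after the loop the most recent positions sit in the _new buffers exactly when cycles is odd:

# Illustrative helper, assuming the ping-pong loop above: iteration i writes into the
# "_new" buffers when i is even, and back into the original buffers when i is odd.
def latest_position_buffers(cycles, d_p_x, d_p_y, d_p_x_new, d_p_y_new):
    if cycles % 2 == 1:
        return d_p_x_new, d_p_y_new   # last write landed in the "_new" buffers
    return d_p_x, d_p_y               # last write landed in the original buffers

latest_x, latest_y = latest_position_buffers(cycles, device_p_x, device_p_y,
                                              device_p_x_new, device_p_y_new)
print(latest_x.copy_to_host())

For the cycles = 2 run shown above this selects device_p_x, whose printout (20.1620903...) reflects both update cycles, while device_p_x_new still holds the positions after only the first cycle.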
