如何使用最大性能对CUDA中的矩阵列进行归一化？ [英] How to normalize matrix columns in CUDA with max performance?

查看：590 发布时间：2017/3/4 11:55:09 performance matrix cuda thrust cublas

本文介绍了如何使用最大性能对CUDA中的矩阵列进行归一化？的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

如何有效地规范化CUDA中的矩阵列？

How to effectively normalize matrix columns in CUDA?

我的矩阵存储在column-major中，典型大小为2000x200。

My matrix is stored in column-major, and the typical size is 2000x200.

该操作可以用以下matlab代码表示。

The operation can be represented in the following matlab code.

A = rand(2000,200);

A = exp(A);
A = A./repmat(sum(A,1), [size(A,1) 1]);

这可以由Thrust，cuBLAS和/或cuNPP有效完成吗？

Can this be done effectively by Thrust, cuBLAS and/or cuNPP?

包括4个内核的快速实现如下所示。

A rapid implementation including 4 kernels is shown as follows.

想知道如果这些可以在1或2内核中提高性能，特别是对于由cublasDgemv（）实现的列求和步骤，
。

Wondering if these can be done in 1 or 2 kernels to improve the performance, especially for the column summation step implemented by cublasDgemv().

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h>
#include <math.h>

struct Exp
{
    __host__ __device__ void operator()(double& x)
    {
        x = exp(x);
    }
};

struct Inv
{
    __host__ __device__ void operator()(double& x)
    {
        x = (double) 1.0 / x;
    }
};

int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    cublasHandle_t hd;
    curandGenerator_t rng;
    cublasCreate(&hd);
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);

    const size_t m = 2000, n = 200;
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> sum(1 * n);
    thrust::device_vector<double> one(m * n, 1.0);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pSum = thrust::raw_pointer_cast(&sum[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    for (int i = 0; i < 100; i++)
    {
        curandGenerateUniformDouble(rng, pA, A.size());


        thrust::for_each(A.begin(), A.end(), Exp());

        cublasDgemv(hd, CUBLAS_OP_T, m, n,
                &c1, pA, m, pOne, 1, &c0, pSum, 1);

        thrust::for_each(sum.begin(), sum.end(), Inv());

        cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return 0;
}

推荐答案

[173.179 us] cublas实现，如问题

[733.734 us]纯推力实施与 thrust :: reduce_by_key from @talonmies

[173.179 us] cublas implementation as shown in the question
[733.734 us] pure Thrust implementation with thrust::reduce_by_key from @talonmies
[1.508 ms] pure Thrust implementation with thrust::inclusive_scan_by_key

可以看出，

cublas在这种情况下表现最好;

两者 thrust :: reduce_by_key & thrust :: inclusive_scan_by_key 启动多个内核，导致额外的开销;

thrust :: inclusive_scan_by_key 与 thrust :: reduce_by_key 相比，将更多的数据写入DRAM，这可能是较长内核时间的原因之一;

 
  cublas和推力法之间的主要性能差异是矩阵列求和。推力减慢可能是因为 thrust :: reduce_by_key 旨在对具有变体长度的段执行缩减，但 cublas_gemv（）



cublas has highest performance in this case;
both thrust::reduce_by_key & thrust::inclusive_scan_by_key launch multiple kernels, which lead to extra overhead;
thrust::inclusive_scan_by_key writes much more data to DRAM compared to thrust::reduce_by_key, which can be one of the reasons for longer kernel time;
the main performance difference between cublas and thrust approach is the matrix column summation. thrust is slower possibly because thrust::reduce_by_key is designed to do reduction on segments with variant length, but cublas_gemv() can only apply to fixed length segments (row/col).

当矩阵A大到足以忽略内核启动开销时， cublas appoach仍然表现最好。 A_ {20,000 x 2,000}上的分析结果如下所示。
When the matrix A is large enough to ignore the kernel launching overhead, the cublas appoach still performs best. The profiling result on A_{20,000 x 2,000} is shown as follows.
  
融合第一个用@talonmies指示的 cublasSgemv 调用的for_each 操作可以进一步提高性能，但我认为应该使用手写的内核， code> thrust :: reduce_by_key 。 
Fusing the first for_each operation with the cublasSgemv call as indicated by @talonmies may further improve the performance, but I think kernel written by hand should be used instead of thrust::reduce_by_key. 
 3种方法的代码如下所示。
The code for the 3 approaches is shown as follows.
#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <math.h>

struct Exp: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double x)
    {
        return exp(x);
    }
};

struct Inv: public thrust::unary_function<double, double>
{
    __host__ __device__ double operator()(double x)
    {
        return (double) 1.0 / x;
    }
};

template<typename T>
struct MulC: public thrust::unary_function<T, T>
{
    T C;
    __host__ __device__ MulC(T c) :
        C(c)
    {
    }
    __host__ __device__ T operator()(T x)
    {
        return x * C;
    }
};

template<typename T>
struct line2col: public thrust::unary_function<T, T>
{
    T C;
    __host__ __device__ line2col(T C) :
            C(C)
    {
    }

    __host__ __device__ T operator()(T i)
    {
        return i / C;
    }
};

int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    cublasHandle_t hd;
    curandGenerator_t rng;
    cublasCreate(&hd);
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);

    const size_t m = 2000, n = 200;
    const double c1 = 1.0;
    const double c0 = 0.0;

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> B(m * n);
    thrust::device_vector<double> C(m * n);
    thrust::device_vector<double> sum1(1 * n);
    thrust::device_vector<double> sum2(1 * n);
    thrust::device_vector<double> one(m * n, 1);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pB = thrust::raw_pointer_cast(&B[0]);
    double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);
    double* pSum2 = thrust::raw_pointer_cast(&sum2[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    curandGenerateUniformDouble(rng, pA, A.size());

    const int count = 2;

    for (int i = 0; i < count; i++)
    {
        thrust::transform(A.begin(), A.end(), B.begin(), Exp());
        cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1);
        thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());
        cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum2, 1, pB, m);
    }

    for (int i = 0; i < count; i++)
    {
        thrust::reduce_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                thrust::make_discard_iterator(),
                sum2.begin());
        thrust::transform(
                A.begin(), A.end(),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                C.begin(),
                thrust::divides<double>());
    }

    for (int i = 0; i < count; i++)
    {
        thrust::inclusive_scan_by_key(
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                thrust::make_transform_iterator(A.begin(), Exp()),
                C.begin());
        thrust::copy(
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))),
                thrust::make_permutation_iterator(
                        C.begin() + m - 1,
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n,
                sum2.begin());
        thrust::transform(
                A.begin(), A.end(),
                thrust::make_permutation_iterator(
                        sum2.begin(),
                        thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                C.begin(),
                thrust::divides<double>());
    }

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return 0;
}


                        这篇关于如何使用最大性能对CUDA中的矩阵列进行归一化？的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

如何使用最大性能对CUDA中的矩阵列进行归一化？ [英] How to normalize matrix columns in CUDA with max performance?

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录关闭

如何使用最大性能对CUDA中的矩阵列进行归一化？ [英] How to normalize matrix columns in CUDA with max performance?

问题描述

推荐答案

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录 关闭

登录关闭