用CUDA求解线性系统AX = B [英] Solving dense linear systems AX = B with CUDA

查看：150 发布时间：2020/11/20 0:50:01 cuda gpu cusolver

本文介绍了用CUDA求解线性系统AX = B的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我可以使用新的cuSOLVER库(CUDA 7)求解形式为的线性系统

Can I use the new cuSOLVER library (CUDA 7) to solve linear systems of the form

AX = B

其中A，X和B是NxN密集矩阵?

推荐答案

是.

方法nr. 1

在cuSOLVER框架中，您可以使用QR分解，请参见 QR分解以解决CUDA中的线性系统.

In the framework of cuSOLVER you can use QR decomposition, see QR decomposition to solve linear systems in CUDA.

方法nr. 2

或者，您可以通过

 cublas<t>getrfBatched()

计算矩阵的LU分解，并且

which calculates the LU decomposition of a matrix, and

cublas<t>getriBatched()

从LU分解开始计算矩阵的逆.

which calculates the inverse of the matrix starting from its LU decomposition.

方法nr. 3

最终的可能性是使用

cublas<t>getrfBatched()

其次是

cublas<t>trsm()

求解上三角三角形系统或下三角三角形系统.

which solves upper or lower triangular linear systems.

罗伯特·克罗维拉(Robert Crovella)指出，答案可能在所涉及矩阵的大小和类型上有所不同.

As pointed out by Robert Crovella, the answer may vary on the size and the type of the involved matrices.

方法nr的代码. 1

请参阅 QR分解以解决CUDA中的线性系统.

方法代码nr. 2和nr. 3

下面，我要报告一个实现nr方法的有效示例. 2和3. Hankel矩阵用于为条件良好的可逆矩阵提供方法.请注意方法nr. 3要求根据调用cublas<t>getrfBatched()后获得的数据透视表数组对系统系数向量进行置换(重新排列).此排列可以在CPU上方便地完成.

Below, I'm reporting a worked example for the implementation of approaches nr. 2 and 3. Hankel matrices are used to feed the approaches with well-conditioned, invertible matrices. Please, note that approach nr. 3 requires permuting (rearranging) the system coefficients vector according to the pivot array obtained following the invokation of cublas<t>getrfBatched(). This permutation can be conveniently done on the CPU.

#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define prec_save 10

#define BLOCKSIZE 256

#define BLOCKSIZEX 16
#define BLOCKSIZEY 16

/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));

    // --- Initialize random seed
    srand(time(NULL));

    // --- Generate random numbers
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];

    free(h_atemp);

}

/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                               double * __restrict h_y, const int N) {

    for (int k = 0; k < N; k++) h_y[k] = 0.f;

    for (int m = 0; m < N; m++)
        for (int n = 0; n < N; n++)
            h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];

}

/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
    for (int i = 0; i < N; i++) {
        double temp = vec[i];
        vec[i] = vec[pivotArray[i] - 1];
        vec[pivotArray[i] - 1] = temp;
    }   
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N = 1000;

    const unsigned int Nmatrices = 1;

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Matrices to be inverted (only one in this example)
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);

    // --- Defining the solution
    double *h_xref = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_xref[k] = 1.f;

    // --- Coefficient vectors (only one in this example)
    double *h_y = (double *)malloc(N * sizeof(double));

    computeCoefficientsVector(h_A, h_xref, h_y, N);

    // --- Result (only one in this example)
    double *h_x = (double *)malloc(N * sizeof(double));

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));

    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();

    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;

    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));

    int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);

    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));

    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
        fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
        }

    double alpha1 = 1.f;
    double beta1 = 0.f;

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);

    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f;

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);

    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

    return 0;
}

运行此示例所需的Utilities.cu和Utilities.cuh文件在此 github页面上进行维护. TimingGPU.cu和TimingGPU.cuh文件在此 github页面中维护.

The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.

关于第三种方法的一些有用的参考文献:

Some useful references on the third approach:

NAG Fortran库常规文件

科学计算软件库(SCSL)用户指南

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

编辑

方法nr的计时(以毫秒为单位). 2和3(在GTX960卡上执行的测试，版本5.2).

Timings (in ms) for approaches nr. 2 and 3 (tests performed on a GTX960 card, cc. 5.2).

N        LU decomposition       Approach nr. 2       Approach nr. 3
100      1.08                   2.75                 1.28
500      45.4                   161                  45.7
1000     302                    1053                 303

出现时，请接近nr. 3更为方便，其成本本质上是计算LU分解的成本.此外:

As it emerges, approach nr. 3 is more convenient and its cost is essentially the cost of computing the LU factorization. Furthermore:

通过LU分解解决线性系统比使用QR分解更快(请参阅

Solving linear systems by LU decomposition is faster than using QR decomposition (see QR decomposition to solve linear systems in CUDA);
LU decomposition is limited to square linear systems, while QR decomposition helps in case of non-square linear systems.

下面的Matlab代码可用于检查结果

The below Matlab code can be used for checking the results

clear all
close all
clc

warning off

N = 1000;

% --- Setting the problem solution
x = ones(N, 1);

%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt

A = reshape(A, N, N);

yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);

load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);

L = eye(N);
for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
end

load pivotArray.txt
Pj = eye(N);
for j = 1 : N
   tempVector = Pj(j, :);
   Pj(j, :) = Pj(pivotArray(j), :);
   Pj(pivotArray(j), :) = tempVector;
end

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));

xprime      = inv(Lmatlab) * yMatlab;
xMatlab     = inv(Umatlab) * xprime;

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));

load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

这篇关于用CUDA求解线性系统AX = B的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

用CUDA求解线性系统AX = B [英] Solving dense linear systems AX = B with CUDA

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

用CUDA求解线性系统AX = B [英] Solving dense linear systems AX = B with CUDA

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭