为什么 cusparse 没有 cusparse&lt;t&gt;cscmm? [英] Why cusparse does not have a cusparse&lt;t&gt;cscmm?

查看:190
本文介绍了为什么 cusparse 没有 cusparse&lt;t&gt;cscmm?的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

cusparse手册只提供了一个函数cusparsecsrmm,它将稀疏矩阵与CSR格式乘以密集矩阵,但是为什么它不为CSC格式的稀疏矩阵提供cusparsecscmm函数(因为它被引入作为稀疏矩阵之一数据格式在手册)?我缺少一些东西?



我试过像Eric的建议,但它失败了:

  cusparse错误:3在cusparse_test.cpp在第106行

和错误代码3是

  CUSPARSE_STATUS_INVALID_VALUE传递了无效的参数(m,n,k,nnz <0或ldb和ldc不正确)。 



我的源代码:

  #include< iostream> 
#include< stdlib.h>
#include< cuda_runtime.h>
#include< cusparse_v2.h>

#include< gsl / gsl_blas.h>
#include< gsl / gsl_vector.h>
#include< gsl / gsl_matrix.h>


using namespace std;


#define TYPE double
#define Zratio 0.3 //零值的百分比
#define M 3
#define K 2
#define N 3

#define CALL_CUDA( err ) \
{ if (err != cudaSuccess) \
    {cout&lt;&lt;"cuda Error "&lt;&lt;err&lt;&lt;" in "&lt;&lt;__FILE__&lt;&lt;" at line "&lt;&lt;__LINE__&lt;&lt;"\n"; exit(EXIT_FAILURE); }\
}

static void HandleError( cusparseStatus_t err,
                         const char *file,
                         int line ) {
    if (err != CUSPARSE_STATUS_SUCCESS) {
        cout&lt;&lt;"cusparse Error: "&lt;&lt;err&lt;&lt;" in "&lt;&lt;file&lt;&lt;" at line "&lt;&lt;line&lt;&lt;endl;
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

int main()
{
cout&lt;&lt;"M: "&lt;&lt;M&lt;&lt;"; K: "&lt;&lt;K&lt;&lt;"; N: "&lt;&lt;N&lt;&lt;endl;
const TYPE alpha = 1.0,beta = 0.0;
int i,j,k;
double dif;

float elapsedTime;
cudaEvent_t start,stop;
CALL_CUDA(cudaEventCreate(& start));
CALL_CUDA(cudaEventCreate(& stop));

cusparseHandle_t hdl;
HANDLE_ERROR(cusparseCreate(& hdl));
cusparseMatDescr_t descr;
HANDLE_ERROR(cusparseCreateMatDescr(& descr));
HANDLE_ERROR(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL));
HANDLE_ERROR(cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO));


//分配主机内存
TYPE * A,* B,* Out,* x,* y;
CALL_CUDA(cudaEventRecord(start,0));
CALL_CUDA(cudaHostAlloc((void **)& A,M * K * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& B,K * N * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& Out,M * N * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaEventRecord(stop,0));
CALL_CUDA(cudaEventSynchronize(stop));
CALL_CUDA(cudaEventElapsedTime(& elapsedTime,start,stop));
cout<< endl<<Host allocation time:<< elapsedTime<<milliseconds<< endl;
TYPE randtmp;
for (i=0; i&lt;M; i++) for (j=0; j&lt;K; j++)
{
    randtmp = (float)rand()/(float)RAND_MAX;
    A[j*M+i] = (randtmp&lt;Zratio)?0:randtmp;
}
for (i=0; i&lt;K; i++) for (j=0; j&lt;N; j++) B[j*K+i] = (float)rand()/(float)RAND_MAX;
cout&lt;&lt;"A:"&lt;&lt;endl; for (i=0; i&lt;M; i++) {for (j=0; j&lt;K; j++) cout&lt;&lt;A[j*M+i]&lt;&lt;" "; cout&lt;&lt;endl;}
cout&lt;&lt;"B:"&lt;&lt;endl; for (i=0; i&lt;K; i++) {for (j=0; j&lt;N; j++) cout&lt;&lt;B[j*K+i]&lt;&lt;" "; cout&lt;&lt;endl;} cout&lt;&lt;endl;
//将密集矩阵转换为CSR格式
TYPE * cscVal;
int * cscColPtr,* cscRowInd;
int nnz,* nnzPerCol;
CALL_CUDA(cudaHostAlloc((void **)& nnzPerCol,K * sizeof(int),cudaHostAllocMapped));
HANDLE_ERROR(cusparseDnnz(hdl,CUSPARSE_DIRECTION_COLUMN,M,K,descr,A,M,nnzPerCol,& nnz));
cout<<A nnz<< nnz<< endl;
cout&lt;&lt;"A nnz per col: "; for (i=0; i&lt;K; i++) cout&lt;&lt;nnzPerCol[i]&lt;&lt;" "; cout&lt;&lt;endl&lt;&lt;endl;
CALL_CUDA(cudaMalloc((void**)&amp;cscVal, nnz*sizeof(TYPE)));
CALL_CUDA(cudaMalloc((void**)&amp;cscColPtr, (K+1)*sizeof(int)));
CALL_CUDA(cudaMalloc((void**)&amp;cscRowInd, nnz*sizeof(int)));
HANDLE_ERROR(cusparseDdense2csc(hdl, M, K, descr, A, M, nnzPerCol, cscVal, cscRowInd, cscColPtr));
TYPE * hcscVal; int *hcscColPtr, *hcscRowInd;
CALL_CUDA(cudaHostAlloc((void**)&amp;hcscVal, nnz*sizeof(TYPE), cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void**)&amp;hcscColPtr, (K+1)*sizeof(int), cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void**)&amp;hcscRowInd, nnz*sizeof(int), cudaHostAllocMapped));
CALL_CUDA(cudaMemcpy(hcscVal, cscVal, nnz*sizeof(TYPE), cudaMemcpyDeviceToHost));
CALL_CUDA(cudaMemcpy(hcscColPtr, cscColPtr, (K+1)*sizeof(int), cudaMemcpyDeviceToHost));
CALL_CUDA(cudaMemcpy(hcscRowInd, cscRowInd, nnz*sizeof(int), cudaMemcpyDeviceToHost));
cout&lt;&lt;"cscVal: "&lt;&lt;endl; for (i=0; i&lt;nnz; i++) cout&lt;&lt;hcscVal[i]&lt;&lt;" "; cout&lt;&lt;endl;
cout&lt;&lt;"cscColPtr: "&lt;&lt;endl; for (i=0; i&lt;K+1; i++) cout&lt;&lt;hcscColPtr[i]&lt;&lt;" "; cout&lt;&lt;endl;
cout&lt;&lt;"cscRowInd: "&lt;&lt;endl; for (i=0; i&lt;nnz; i++) cout&lt;&lt;hcscRowInd[i]&lt;&lt;" "; cout&lt;&lt;endl&lt;&lt;endl;

// GPU-cusparse计算
TYPE * gB,* gOut;
CALL_CUDA(cudaMalloc((void **)& gB,K * N * sizeof(TYPE)));
CALL_CUDA(cudaMalloc((void **)& gOut,M * N * sizeof(TYPE)));
CALL_CUDA(cudaMemcpy(gB,B,K * N * sizeof(TYPE),cudaMemcpyHostToDevice));
HANDLE_ERROR(cusparseDcsrmm(hdl, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &amp;alpha, descr, cscVal, cscColPtr, cscRowInd, gB, K, &amp;beta, gOut, M));
CALL_CUDA(cudaMemcpy(Out, gOut, M*N*sizeof(TYPE), cudaMemcpyDeviceToHost));
cout&lt;&lt;"Out:"&lt;&lt;endl; for (i=0; i&lt;M; i++) {for (j=0; j&lt;N; j++) cout&lt;&lt;Out[j*M+i]&lt;&lt;" "; cout&lt;&lt;endl;}; cout&lt;&lt;endl;

//清除
CALL_CUDA(cudaFreeHost(A));
CALL_CUDA(cudaFreeHost(B));
CALL_CUDA(cudaFreeHost(hcscVal));
CALL_CUDA(cudaFreeHost(hcscColPtr));
CALL_CUDA(cudaFreeHost(hcscRowInd));
CALL_CUDA(cudaFreeHost(Out));
CALL_CUDA(cudaFree(gB));
CALL_CUDA(cudaFree(cscVal));
CALL_CUDA(cudaFree(cscColPtr));
CALL_CUDA(cudaFree(cscRowInd));
CALL_CUDA(cudaFree(gOut));
return 0;
}


解决方案

因为没有必要提供单独的 cscmm()。



现有的 csrmm() 可以完成 cscmm() 将执行的确切操作，只要你翻转参数 cusparseOperation_t transA。



你可以这样做的原因是：矩阵 A 的 CSC 表示形式与转置矩阵 A' 的 CSR 表示完全相同。


The cusparse manual only provides a function cusparsecsrmm which will multiply a sparse matrix with CSR format by a dense matrix, but why it does not provide a cusparsecscmm function for sparse matrix with CSC format (since it was introduced as one of the sparse data format in the manual)? Am I missing something?

I tried as suggested by Eric, but it failed with:

cusparse Error: 3 in cusparse_test.cpp at line 106

and error code 3 is

CUSPARSE_STATUS_INVALID_VALUE invalid parameters were passed (m,n,k,nnz<0 or ldb and ldc are incorrect).

from the manual.

Following is my source code:

#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>

#include <gsl/gsl_blas.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>


using namespace std;


#define TYPE double
#define Zratio 0.3 //The percentage of zero values
#define M 3
#define K 2
#define N 3

// Abort with file/line diagnostics when a CUDA runtime call fails.
// NOTE(review): prints the numeric cudaError_t; cudaGetErrorString(err)
// would give a human-readable message.
#define CALL_CUDA( err ) \
{ if (err != cudaSuccess) \
    {cout<<"cuda Error "<<err<<" in "<<__FILE__<<" at line "<<__LINE__<<"\n"; exit(EXIT_FAILURE); }\
}

// Abort the program with a diagnostic (raw status code plus the file/line of
// the failing call) when a cuSPARSE call does not return success.
// Invoked through the HANDLE_ERROR macro so call sites report their own
// __FILE__/__LINE__.
static void HandleError( cusparseStatus_t err,
                         const char *file,
                         int line ) {
    if (err == CUSPARSE_STATUS_SUCCESS)
        return;                       // fast path: nothing to report
    cout << "cusparse Error: " << err << " in " << file << " at line " << line << endl;
    exit( EXIT_FAILURE );
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Demonstrates performing a "cscmm" (sparse CSC matrix times dense matrix)
// with csrmm(): the CSC arrays of A (M x K, column-major) are byte-identical
// to the CSR arrays of its transpose A' (K x M), so Out = alpha*A*B + beta*Out
// is obtained by calling csrmm on that K x M "CSR" matrix with
// transA = CUSPARSE_OPERATION_TRANSPOSE.
int main()
{
    cout<<"M: "<<M<<"; K: "<<K<<"; N: "<<N<<endl;

    const TYPE alpha =1.0, beta = 0.0;
    int i,j;                    // loop counters (unused locals k/dif/x/y removed)

    float elapsedTime;
    cudaEvent_t start,stop;
    CALL_CUDA( cudaEventCreate( &start ) );
    CALL_CUDA( cudaEventCreate( &stop ) );

    // cuSPARSE context plus a general, zero-based-index matrix descriptor.
    cusparseHandle_t hdl;
    HANDLE_ERROR(cusparseCreate(&hdl));
    cusparseMatDescr_t descr;
    HANDLE_ERROR(cusparseCreateMatDescr(&descr));
    HANDLE_ERROR(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    HANDLE_ERROR(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));

    // Allocate pinned (page-locked) host memory; allocation is timed with events.
    TYPE *A, *B, *Out;
    CALL_CUDA( cudaEventRecord( start, 0 ) );
    CALL_CUDA(cudaHostAlloc((void**)&A, M*K*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&B, K*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&Out, M*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA( cudaEventRecord( stop, 0 ) );
    CALL_CUDA( cudaEventSynchronize( stop ) );
    CALL_CUDA( cudaEventElapsedTime( &elapsedTime,start, stop ) );
    cout<<endl<<"Host allocation time: "<<elapsedTime<<" milliseconds"<<endl;

    // Fill A (M x K, column-major) with roughly Zratio zeros; B (K x N) dense.
    TYPE randtmp;
    for(i=0; i<M; i++) for (j=0; j<K; j++)
    {
        randtmp = (float)rand()/(float)RAND_MAX;
        A[j*M+i] = (randtmp<Zratio)?0:randtmp;
    }
    for(i=0; i<K; i++) for (j=0; j<N; j++) B[j*K+i] = (float)rand()/(float)RAND_MAX;

    cout<<"A:"<<endl; for (i=0; i<M; i++) {for (j=0; j<K; j++) cout<<A[j*M+i]<<" "; cout<<endl;}
    cout<<"B:"<<endl; for (i=0; i<K; i++) {for (j=0; j<N; j++) cout<<B[j*K+i]<<" "; cout<<endl;} cout<<endl;

    // Convert dense A to CSC format on the device.
    TYPE * cscVal;
    int * cscColPtr, * cscRowInd;
    int nnz, * nnzPerCol;
    CALL_CUDA(cudaHostAlloc((void**)&nnzPerCol, K*sizeof(int), cudaHostAllocMapped));
    HANDLE_ERROR(cusparseDnnz(hdl, CUSPARSE_DIRECTION_COLUMN, M, K, descr, A, M, nnzPerCol, &nnz));
    cout<<"A nnz "<<nnz<<endl;
    cout<<"A nnz per col: "; for (i=0; i<K; i++) cout<<nnzPerCol[i]<<" "; cout<<endl<<endl;
    CALL_CUDA(cudaMalloc((void**)&cscVal, nnz*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&cscColPtr, (K+1)*sizeof(int)));
    CALL_CUDA(cudaMalloc((void**)&cscRowInd, nnz*sizeof(int)));
    HANDLE_ERROR(cusparseDdense2csc(hdl, M, K, descr, A, M, nnzPerCol, cscVal, cscRowInd, cscColPtr));

    // Copy the CSC arrays back to the host for inspection/printing.
    TYPE * hcscVal; int *hcscColPtr, *hcscRowInd;
    CALL_CUDA(cudaHostAlloc((void**)&hcscVal, nnz*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscColPtr, (K+1)*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscRowInd, nnz*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaMemcpy(hcscVal, cscVal, nnz*sizeof(TYPE), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscColPtr, cscColPtr, (K+1)*sizeof(int), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscRowInd, cscRowInd, nnz*sizeof(int), cudaMemcpyDeviceToHost));
    cout<<"cscVal: "<<endl; for (i=0; i<nnz; i++) cout<<hcscVal[i]<<" "; cout<<endl;
    cout<<"cscColPtr: "<<endl; for (i=0; i<K+1; i++) cout<<hcscColPtr[i]<<" "; cout<<endl;
    cout<<"cscRowInd: "<<endl; for (i=0; i<nnz; i++) cout<<hcscRowInd[i]<<" "; cout<<endl<<endl;


    // GPU cuSPARSE calculation.
    TYPE *gB, *gOut;
    CALL_CUDA(cudaMalloc((void**)&gB,K*N*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&gOut,M*N*sizeof(TYPE)));
    CALL_CUDA(cudaMemcpy(gB, B, K*N*sizeof(TYPE), cudaMemcpyHostToDevice));
    // FIX: the CSC arrays of A describe, in CSR terms, the K x M matrix A'.
    // csrmm's (m, n, k) arguments must therefore be (K, N, M), not (M, N, K).
    // With transA = TRANSPOSE, cuSPARSE requires ldb >= m (= K) and
    // ldc >= k (= M); the original (M, N, K) ordering violated those checks
    // and produced CUSPARSE_STATUS_INVALID_VALUE (error code 3).
    HANDLE_ERROR(cusparseDcsrmm(hdl, CUSPARSE_OPERATION_TRANSPOSE, K, N, M, nnz, &alpha, descr, cscVal, cscColPtr, cscRowInd, gB, K, &beta, gOut, M));
    CALL_CUDA(cudaMemcpy(Out, gOut, M*N*sizeof(TYPE), cudaMemcpyDeviceToHost));
    cout<<"Out:"<<endl; for (i=0; i<M; i++) {for (j=0; j<N; j++) cout<<Out[j*M+i]<<" "; cout<<endl;}; cout<<endl;


    // Clean up (also releases nnzPerCol, the cuSPARSE handle/descriptor and
    // the CUDA events, which the original version leaked).
    CALL_CUDA(cudaFreeHost(A));
    CALL_CUDA(cudaFreeHost(B));
    CALL_CUDA(cudaFreeHost(nnzPerCol));
    CALL_CUDA(cudaFreeHost(hcscVal));
    CALL_CUDA(cudaFreeHost(hcscColPtr));
    CALL_CUDA(cudaFreeHost(hcscRowInd));
    CALL_CUDA(cudaFreeHost(Out));
    CALL_CUDA(cudaFree(gB));
    CALL_CUDA(cudaFree(cscVal));
    CALL_CUDA(cudaFree(cscColPtr));
    CALL_CUDA(cudaFree(cscRowInd));
    CALL_CUDA(cudaFree(gOut));
    HANDLE_ERROR(cusparseDestroyMatDescr(descr));
    HANDLE_ERROR(cusparseDestroy(hdl));
    CALL_CUDA(cudaEventDestroy(start));
    CALL_CUDA(cudaEventDestroy(stop));
    return 0;
}

解决方案

Because there's no such need to provide a separate cscmm().

The existing csrmm() can do the exact operation that cscmm() will do, only if you flip over the parameter cusparseOperation_t transA.

The reason you can do this is that the CSC representation of a matrix A is exactly the same as the CSR representation of the transposed matrix A'

这篇关于为什么 cusparse 没有 cusparse&lt;t&gt;cscmm?的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆