为什么cusparse没有cusparse&lt;t&gt;cscmm? [英] Why cusparse does not have a cusparse&lt;t&gt;cscmm?
问题描述
我试过像Eric的建议,但它失败了:
cusparse错误:3在cusparse_test.cpp在第106行
和错误代码3是
CUSPARSE_STATUS_INVALID_VALUE传递了无效的参数(m,n,k,nnz <0或ldb和ldc不正确)。
我的源代码: #include< iostream>
#include< stdlib.h>
#include< cuda_runtime.h>
#include< cusparse_v2.h>
#include< gsl / gsl_blas.h>
#include< gsl / gsl_vector.h>
#include< gsl / gsl_matrix.h>
using namespace std;
#define TYPE double
#define Zratio 0.3 //零值的百分比
#define M 3
#define K 2
#define N 3
#define CALL_CUDA(err)\
{if(err!= cudaSuccess)\
{cout<<cuda Error <<在线<< __ LINE __<\ n中的<__ FILE __ < exit(EXIT_FAILURE); } \
}
static void HandleError(cusparseStatus_t err,
const char * file,
int line){
if (err != CUSPARSE_STATUS_SUCCESS) { cout<<"cusparse Error: "<<err<<" in "<<file<<" at line "<<line<<endl;
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err)(HandleError(err,__FILE__,__LINE__))
int main()
{
cout
const TYPE alpha = 1.0,beta = 0.0;
int i,j,k;
double dif;
float elapsedTime;
cudaEvent_t start,stop;
CALL_CUDA(cudaEventCreate(& start));
CALL_CUDA(cudaEventCreate(& stop));
cusparseHandle_t hdl;
HANDLE_ERROR(cusparseCreate(& hdl));
cusparseMatDescr_t descr;
HANDLE_ERROR(cusparseCreateMatDescr(& descr));
HANDLE_ERROR(cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL));
HANDLE_ERROR(cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO));
//分配主机内存
TYPE * A,* B,* Out,* x,* y;
CALL_CUDA(cudaEventRecord(start,0));
CALL_CUDA(cudaHostAlloc((void **)& A,M * K * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& B,K * N * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& Out,M * N * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaEventRecord(stop,0));
CALL_CUDA(cudaEventSynchronize(stop));
CALL_CUDA(cudaEventElapsedTime(& elapsedTime,start,stop));
cout<< endl<<Host allocation time:<< elapsedTime<<milliseconds<< endl;
TYPE randtmp;
for(i = 0; i {
randtmp =(float)rand ;
A [j * M + i] =(randtmp< Zratio)0:randtmp; $(j = 0; j
cout<<A:<< endl; (i = 0; i cout<<B:<< endl; for(i = 0; i
//将密集矩阵转换为CSR格式
TYPE * cscVal;
int * cscColPtr,* cscRowInd;
int nnz,* nnzPerCol;
CALL_CUDA(cudaHostAlloc((void **)& nnzPerCol,K * sizeof(int),cudaHostAllocMapped));
HANDLE_ERROR(cusparseDnnz(hdl,CUSPARSE_DIRECTION_COLUMN,M,K,descr,A,M,nnzPerCol,& nnz));
cout<<A nnz<< nnz<< endl;
cout<<A nnz per col:; for(i = 0; i CALL_CUDA(cudaMalloc((void **)& cscVal,nnz * sizeof(TYPE)));
CALL_CUDA(cudaMalloc((void **)& cscColPtr,(K + 1)* sizeof(int)))
CALL_CUDA(cudaMalloc((void **)& cscRowInd,nnz * sizeof(int)));
HANDLE_ERROR(cusparseDdense2csc(hdl,M,K,descr,A,M,nnzPerCol,cscVal,cscRowInd,cscColPtr));
TYPE * hcscVal; int * hcscColPtr,* hcscRowInd;
CALL_CUDA(cudaHostAlloc((void **)& hcscVal,nnz * sizeof(TYPE),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& hcscColPtr,(K + 1)* sizeof(int),cudaHostAllocMapped));
CALL_CUDA(cudaHostAlloc((void **)& hcscRowInd,nnz * sizeof(int),cudaHostAllocMapped));
CALL_CUDA(cudaMemcpy(hcscVal,cscVal,nnz * sizeof(TYPE),cudaMemcpyDeviceToHost));
CALL_CUDA(cudaMemcpy(hcscColPtr,cscColPtr,(K + 1)* sizeof(int),cudaMemcpyDeviceToHost));
CALL_CUDA(cudaMemcpy(hcscRowInd,cscRowInd,nnz * sizeof(int),cudaMemcpyDeviceToHost));
cout<<cscVal:<< endl; for(i = 0; i cout<<cscColPtr:<< endl; for(i = 0; i cout<<cscRowInd:<< endl; for(i = 0; i
// GPU-cusparse计算
TYPE * gB,* gOut;
CALL_CUDA(cudaMalloc((void **)& gB,K * N * sizeof(TYPE)));
CALL_CUDA(cudaMalloc((void **)& gOut,M * N * sizeof(TYPE)));
CALL_CUDA(cudaMemcpy(gB,B,K * N * sizeof(TYPE),cudaMemcpyHostToDevice));
HANDLE_ERROR(cusparseDcsrmm(hdl,CUSPARSE_OPERATION_TRANSPOSE,M,N,K,nnz,& alpha,descr,cscVal,cscColPtr,cscRowInd,gB,K,& beta,gOut,M)
CALL_CUDA(cudaMemcpy(Out,gOut,M * N * sizeof(TYPE),cudaMemcpyDeviceToHost));
cout<<Out:<< endl;对于(i = 0; i
//清除
CALL_CUDA(cudaFreeHost(A));
CALL_CUDA(cudaFreeHost(B));
CALL_CUDA(cudaFreeHost(hcscVal));
CALL_CUDA(cudaFreeHost(hcscColPtr));
CALL_CUDA(cudaFreeHost(hcscRowInd));
CALL_CUDA(cudaFreeHost(Out));
CALL_CUDA(cudaFree(gB));
CALL_CUDA(cudaFree(cscVal));
CALL_CUDA(cudaFree(cscColPtr));
CALL_CUDA(cudaFree(cscRowInd));
CALL_CUDA(cudaFree(gOut));
return 0;
}
解决方案 因为没有必要提供单独的 cscmm()
。
现有的 csrmm $ c>可以做 cscmm()
的确切操作,只有当你翻转参数 cusparseOperation_t transA
。
你可以这样做的原因是矩阵 A
的CSC表示形式与转置矩阵的CSR表示 A'
The cusparse manual only provides a function cusparsecsrmm which will multiply a sparse matrix with CSR format by a dense matrix, but why it does not provide a cusparsecscmm function for sparse matrix with CSC format (since it was introduced as one of the sparse data format in the manual)? Am I missing something?
I tried as suggested by Eric, but it failed with:
cusparse Error: 3 in cusparse_test.cpp at line 106
and error code 3 is
CUSPARSE_STATUS_INVALID_VALUE invalid parameters were passed (m,n,k,nnz<0 or ldb and ldc are incorrect).
from the manual.
Following is my source code:
#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
using namespace std;
#define TYPE double
#define Zratio 0.3 //The percentage of zero values
// Problem dimensions: A is M x K (sparse), B is K x N (dense), Out = A*B is M x N.
#define M 3
#define K 2
#define N 3
// Abort with a diagnostic when a CUDA runtime call fails.
// The argument is evaluated exactly ONCE into a local (the original macro
// re-evaluated `err` when printing it, executing the CUDA call twice), and
// the do { } while (0) wrapper makes the macro a single statement that is
// safe inside un-braced if/else.
#define CALL_CUDA( err ) \
do { cudaError_t err_ = (err); \
if (err_ != cudaSuccess) \
{cout<<"cuda Error "<<err_<<" in "<<__FILE__<<" at line "<<__LINE__<<"\n"; exit(EXIT_FAILURE); }\
} while (0)
// Abort the program with a diagnostic when a cuSPARSE call fails.
//   err  - status returned by a cuSPARSE API call
//   file - source file of the call site (__FILE__)
//   line - line number of the call site (__LINE__)
static void HandleError( cusparseStatus_t err,
                         const char *file,
                         int line ) {
    if (err == CUSPARSE_STATUS_SUCCESS)
        return;                                   // fast path: nothing to report
    cout<<"cusparse Error: "<<err<<" in "<<file<<" at line "<<line<<endl;
    exit( EXIT_FAILURE );
}
// Convenience wrapper that stamps the call site automatically.
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
// Emulate the missing cusparse<t>cscmm() with cusparse<t>csrmm():
// the CSC representation of A (M x K) is byte-identical to the CSR
// representation of A' (K x M), so csrmm over that data with
// transA = CUSPARSE_OPERATION_TRANSPOSE computes Out = (A')' * B = A * B.
int main()
{
    cout<<"M: "<<M<<"; K: "<<K<<"; N: "<<N<<endl;
    const TYPE alpha = 1.0, beta = 0.0;
    int i, j;                      // loop indices (unused k/dif/x/y removed)
    float elapsedTime;
    cudaEvent_t start, stop;
    CALL_CUDA( cudaEventCreate( &start ) );
    CALL_CUDA( cudaEventCreate( &stop ) );

    cusparseHandle_t hdl;
    HANDLE_ERROR(cusparseCreate(&hdl));
    cusparseMatDescr_t descr;
    HANDLE_ERROR(cusparseCreateMatDescr(&descr));
    HANDLE_ERROR(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    HANDLE_ERROR(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));

    // Allocate mapped pinned host memory. The cusparse conversion routines
    // below are handed A directly; this relies on mapped/unified addressing
    // being available -- NOTE(review): confirm UVA on the target device.
    TYPE *A, *B, *Out;
    CALL_CUDA( cudaEventRecord( start, 0 ) );
    CALL_CUDA(cudaHostAlloc((void**)&A, M*K*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&B, K*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&Out, M*N*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA( cudaEventRecord( stop, 0 ) );
    CALL_CUDA( cudaEventSynchronize( stop ) );
    CALL_CUDA( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    cout<<endl<<"Host allocation time: "<<elapsedTime<<" milliseconds"<<endl;

    // Fill A (column-major, M x K) with random values, ~Zratio of them zeroed;
    // fill B (column-major, K x N) with random values.
    TYPE randtmp;
    for (i = 0; i < M; i++) for (j = 0; j < K; j++)
    {
        randtmp = (float)rand()/(float)RAND_MAX;
        A[j*M+i] = (randtmp<Zratio)?0:randtmp;
    }
    for (i = 0; i < K; i++) for (j = 0; j < N; j++) B[j*K+i] = (float)rand()/(float)RAND_MAX;
    cout<<"A:"<<endl; for (i=0; i<M; i++) {for (j=0; j<K; j++) cout<<A[j*M+i]<<" "; cout<<endl;}
    cout<<"B:"<<endl; for (i=0; i<K; i++) {for (j=0; j<N; j++) cout<<B[j*K+i]<<" "; cout<<endl;} cout<<endl;

    // Convert the dense matrix A to CSC format (per-column nnz, then dense2csc).
    TYPE * cscVal;
    int * cscColPtr, * cscRowInd;
    int nnz, * nnzPerCol;
    CALL_CUDA(cudaHostAlloc((void**)&nnzPerCol, K*sizeof(int), cudaHostAllocMapped));
    HANDLE_ERROR(cusparseDnnz(hdl, CUSPARSE_DIRECTION_COLUMN, M, K, descr, A, M, nnzPerCol, &nnz));
    cout<<"A nnz "<<nnz<<endl;
    cout<<"A nnz per col: "; for (i=0; i<K; i++) cout<<nnzPerCol[i]<<" "; cout<<endl<<endl;
    CALL_CUDA(cudaMalloc((void**)&cscVal, nnz*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&cscColPtr, (K+1)*sizeof(int)));
    CALL_CUDA(cudaMalloc((void**)&cscRowInd, nnz*sizeof(int)));
    HANDLE_ERROR(cusparseDdense2csc(hdl, M, K, descr, A, M, nnzPerCol, cscVal, cscRowInd, cscColPtr));

    // Copy the CSC arrays back to the host for inspection.
    TYPE * hcscVal; int *hcscColPtr, *hcscRowInd;
    CALL_CUDA(cudaHostAlloc((void**)&hcscVal, nnz*sizeof(TYPE), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscColPtr, (K+1)*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaHostAlloc((void**)&hcscRowInd, nnz*sizeof(int), cudaHostAllocMapped));
    CALL_CUDA(cudaMemcpy(hcscVal, cscVal, nnz*sizeof(TYPE), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscColPtr, cscColPtr, (K+1)*sizeof(int), cudaMemcpyDeviceToHost));
    CALL_CUDA(cudaMemcpy(hcscRowInd, cscRowInd, nnz*sizeof(int), cudaMemcpyDeviceToHost));
    cout<<"cscVal: "<<endl; for (i=0; i<nnz; i++) cout<<hcscVal[i]<<" "; cout<<endl;
    cout<<"cscColPtr: "<<endl; for (i=0; i<K+1; i++) cout<<hcscColPtr[i]<<" "; cout<<endl;
    cout<<"cscRowInd: "<<endl; for (i=0; i<nnz; i++) cout<<hcscRowInd[i]<<" "; cout<<endl<<endl;

    // GPU-cusparse calculation: Out = A * B.
    TYPE *gB, *gOut;
    CALL_CUDA(cudaMalloc((void**)&gB,K*N*sizeof(TYPE)));
    CALL_CUDA(cudaMalloc((void**)&gOut,M*N*sizeof(TYPE)));
    CALL_CUDA(cudaMemcpy(gB, B, K*N*sizeof(TYPE), cudaMemcpyHostToDevice));
    // BUG FIX: the sparse matrix handed to csrmm is A' as stored (K rows,
    // M cols), so its (m, n, k) arguments must be (K, N, M), not (M, N, K).
    // With (M, N, K) the ldb >= m / ldc >= k checks fail and csrmm returns
    // CUSPARSE_STATUS_INVALID_VALUE (error 3) -- exactly the reported failure.
    // Here op(A') = (A')' is M x K, B is K x N (ldb = K), Out is M x N (ldc = M).
    HANDLE_ERROR(cusparseDcsrmm(hdl, CUSPARSE_OPERATION_TRANSPOSE, K, N, M, nnz, &alpha, descr, cscVal, cscColPtr, cscRowInd, gB, K, &beta, gOut, M));
    CALL_CUDA(cudaMemcpy(Out, gOut, M*N*sizeof(TYPE), cudaMemcpyDeviceToHost));
    cout<<"Out:"<<endl; for (i=0; i<M; i++) {for (j=0; j<N; j++) cout<<Out[j*M+i]<<" "; cout<<endl;}; cout<<endl;

    // Clean up: host allocations, device allocations, and (previously leaked)
    // the nnzPerCol buffer, cuSPARSE handle/descriptor and CUDA events.
    CALL_CUDA(cudaFreeHost(A));
    CALL_CUDA(cudaFreeHost(B));
    CALL_CUDA(cudaFreeHost(nnzPerCol));
    CALL_CUDA(cudaFreeHost(hcscVal));
    CALL_CUDA(cudaFreeHost(hcscColPtr));
    CALL_CUDA(cudaFreeHost(hcscRowInd));
    CALL_CUDA(cudaFreeHost(Out));
    CALL_CUDA(cudaFree(gB));
    CALL_CUDA(cudaFree(cscVal));
    CALL_CUDA(cudaFree(cscColPtr));
    CALL_CUDA(cudaFree(cscRowInd));
    CALL_CUDA(cudaFree(gOut));
    HANDLE_ERROR(cusparseDestroyMatDescr(descr));
    HANDLE_ERROR(cusparseDestroy(hdl));
    CALL_CUDA(cudaEventDestroy(start));
    CALL_CUDA(cudaEventDestroy(stop));
    return 0;
}
解决方案 Because there's no such need to provide a separate cscmm()
.
The existing csrmm()
can do the exact operation that cscmm()
will do, only if you flip over the parameter cusparseOperation_t transA
.
The reason you can do this is that the CSC representation of a matrix A
is exactly the same as the CSR representation of the transposed matrix A'
这篇关于为什么尖峰不具有尖锐的cscmm?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!