Rcpp并行或openmp用于矩阵向量乘积 [英] Rcpp Parallel or openmp for matrixvector product

查看：167 发布时间：2020/5/7 19:45:19 openmp rcpp matrix-multiplication armadillo rcppparallel

本文介绍了Rcpp并行或openmp用于矩阵向量乘积的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在尝试对Conjugate渐变的原始并行版本进行编程，因此我从简单的Wikipedia算法开始，并且我想通过相应的并行版本更改dot-products和MatrixVector产品，Rcppparallel文档具有使用parallelReduce的dot-product代码；我想我将在该版本的代码中使用该版本，但是我正在尝试进行MatrixVector乘法，但是与R base(无并行)

I am trying to program the naive parallel version of Conjugate gradient, so I started with the simple Wikipedia algorithm, and I want to change the dot-products and MatrixVector products by their appropriate parallel version, The Rcppparallel documentation has the code for the dot-product using parallelReduce; I think I'm gonna use that version for my code, but I'm trying to make the MatrixVector multiplication, but I haven't achieved good results compared to R base (no parallel)

并行矩阵乘法的某些版本:使用OpenMP，Rcppparallel，串行版本，带有Armadillo的串行版本以及基准测试

Some versions of parallel matrix multiplication: using OpenMP, Rcppparallel, serial version, a serial version with Armadillo, and the benchmark

// [[Rcpp::depends(RcppParallel)]]
#include <Rcpp.h>
#include <RcppParallel.h>
#include <numeric>
// #include <cstddef>
// #include <cstdio>
#include <iostream>
using namespace RcppParallel;
using namespace Rcpp;

struct InnerProduct : public Worker
{   
   // source vectors
   const RVector<double> x;
   const RVector<double> y;

   // product that I have accumulated
   double product;

   // constructors
   InnerProduct(const NumericVector x, const NumericVector y) 
      : x(x), y(y), product(0) {}
   InnerProduct(const InnerProduct& innerProduct, Split) 
      : x(innerProduct.x), y(innerProduct.y), product(0) {}

   // process just the elements of the range I've been asked to
   void operator()(std::size_t begin, std::size_t end) {
      product += std::inner_product(x.begin() + begin, 
                                    x.begin() + end, 
                                    y.begin() + begin, 
                                    0.0);
   }

   // join my value with that of another InnerProduct
   void join(const InnerProduct& rhs) { 
     product += rhs.product; 
   }
};

struct MatrixMultiplication : public Worker
{
   // source matrix
   const RMatrix<double> A;

    //source vector
   const RVector<double> x;

   // destination matrix
   RMatrix<double> out;

   // initialize with source and destination
   MatrixMultiplication(const NumericMatrix A, const NumericVector x, NumericMatrix out) 
     : A(A), x(x), out(out) {}

   // take the square root of the range of elements requested
   void operator()(std::size_t begin, std::size_t end) {
      for (std::size_t i = begin; i < end; i++) {
            // rows we will operate on
            //RMatrix<double>::Row rowi = A.row(i);
            RMatrix<double>::Row rowi = A.row(i);

            //double res = std::inner_product(rowi.begin(), rowi.end(), x.begin(), 0.0);
            //Rcout << "res" << res << std::endl;
            out(i,1) = std::inner_product(rowi.begin(), rowi.end(), x.begin(), 0.0);
            //Rcout << "res" << out(i,1) << std::endl;
      }
    }  
};

// [[Rcpp::export]]
double parallelInnerProduct(NumericVector x, NumericVector y) {

   // declare the InnerProduct instance that takes a pointer to the vector data
   InnerProduct innerProduct(x, y);

   // call paralleReduce to start the work
   parallelReduce(0, x.length(), innerProduct);

   // return the computed product
   return innerProduct.product;
}
//librar(Rbenchmark)

// [[Rcpp::export]]
NumericVector matrixXvectorRcppParallel(NumericMatrix A, NumericVector x) {

   // // declare the InnerProduct instance that takes a pointer to the vector data
   // InnerProduct innerProduct(x, y);
   int nrows = A.nrow();
   NumericVector out(nrows);
   for(int i = 0; i< nrows;i++ )
   {
      out(i) = parallelInnerProduct(A(i,_),x);
   }
   // return the computed product
   return out;
}

// [[Rcpp::export]]
arma::rowvec matrixXvectorParallel(arma::mat A, arma::colvec x){
    arma::rowvec y = A.row(0)*0;
    int filas = A.n_rows;
    int columnas = A.n_cols;
    #pragma omp parallel for
    for(int j=0;j<columnas;j++)
    {
        //y(j) = A.row(j)*x(j))
        y(j) = dotproduct(A.row(j),x);
    }
    return y;
} 

arma::mat matrixXvector2(arma::mat A, arma::mat x){
  //arma::rowvec y = A.row(0)*0;
  //y=A*x;
  return A*x;
}

arma::rowvec matrixXvectorParallel2(arma::mat A, arma::colvec x){
    arma::rowvec y = A.row(0)*0;
    int filas = A.n_rows;
    int columnas = A.n_cols;

 #pragma omp parallel for
    for(int j = 0; j < columnas ; j++){
        double result = 0;
        for(int i = 0; i < filas; i++){
                result += x(i)*A(j,i);   
        }
        y(j) = result;
    }
    return y;
}

基准

                             test replications elapsed relative user.self sys.self user.child sys.child
1                         M %*% a           20   0.026    1.000     0.140    0.060          0         0
2 matrixXvector2(M, as.matrix(a))           20   0.040    1.538     0.101    0.217          0         0
4    matrixXvectorParallel2(M, a)           20   0.063    2.423     0.481    0.000          0         0
3     matrixXvectorParallel(M, a)           20   0.146    5.615     0.745    0.398          0         0
5 matrixXvectorRcppParallel(M, a)           20   0.335   12.885     2.305    0.079          0         0

目前，我的最后一次试用是将Parallefor与Rcppparallel一起使用，但是我遇到了内存错误，而且我不知道问题出在哪里

My last trial at the moment was using parallefor with Rcppparallel, but I'm getting memory errors and I dont have idea where the problem is

// [[Rcpp::export]]
NumericVector matrixXvectorRcppParallel2(NumericMatrix A, NumericVector x) {

   // // declare the InnerProduct instance that takes a pointer to the vector data
   int nrows = A.nrow();
   NumericMatrix out(nrows,1); //allocar mempria de vector de salida
   //crear worker
   MatrixMultiplication matrixMultiplication(A, x, out);


   parallelFor(0,A.nrow(),matrixMultiplication);

   // return the computed product
   return out;
}

我注意到的是，当我使用htop检入终端时，处理器的工作方式时，我在htop中看到了当我使用基于R-base的常规矩阵矢量乘法(即使用所有处理器)时，矩阵是否乘法默认并行执行?因为从理论上讲，如果是串行版本，则只有一个处理器可以工作.

What I notice is that when I check in my terminal using htop how the processors are working, I see in htop when I apply the conventional Matrix vector multiplication using R-base, that is using all the processors, so Does the matrix multiplication perform parallel by default? because in theory, only one processor should be working if is the serial version.

如果有人知道哪种更好的方法，OpenMP或Rcppparallel或另一种方法，比明显的R-base串行版本给我更好的性能.

If someone knows which is the better path, OpenMP or Rcppparallel, or another way, that gives me better performance than the apparently serial version of R-base.

目前共轭梯度的序列码

// [[Rcpp::export]]
arma::colvec ConjugateGradient(arma::mat A, arma::colvec xini, arma::colvec b, int num_iteraciones){
    //arma::colvec xnew = xini*0 //inicializar en 0's
    arma::colvec x= xini; //inicializar en 0's
    arma::colvec rkold = b - A*xini;
    arma::colvec rknew = b*0;
    arma::colvec pk = rkold;
    int k=0;
    double alpha_k=0;
    double betak=0;
    double normak = 0.0;

    for(k=0; k<num_iteraciones;k++){
         Rcout << "iteracion numero " << k << std::endl;
        alpha_k =  sum(rkold.t() * rkold) / sum(pk.t()*A*pk); //sum de un elemento para realizar casting
        (pk.t()*A*pk);
        x = x+ alpha_k * pk;
        rknew = rkold - alpha_k*A*pk;
        normak =  sum(rknew.t()*rknew);
        if( normak < 0.000001){
            break;
        }
        betak = sum(rknew.t()*rknew) / sum( rkold.t() * rkold );

        //actualizar valores para siguiente iteracion
        pk = rknew + betak*pk;
        rkold = rknew;

    }

    return x;

}

感谢Hong Ooi和tim18，我不知道在R中使用BLAS，所以使用option(matprod ="internal")和option(matprod ="blas")的新基准测试

I wasn't aware of the use of BLAS in R, thanks Hong Ooi and tim18, so the new benchmark using option(matprod="internal") and option(matprod="blas")

options(matprod = "internal")
res<-benchmark(M%*%a,matrixXvector2(M,as.matrix(a)),matrixXvectorParallel(M,a),matrixXvectorParallel2(M,a),matrixXvectorRcppParallel(M,a),order="relative",replications = 20)
res

                             test replications elapsed relative user.self sys.self user.child sys.child
2 matrixXvector2(M, as.matrix(a))           20   0.043    1.000     0.107    0.228          0         0
4    matrixXvectorParallel2(M, a)           20   0.069    1.605     0.530    0.000          0         0
1                         M %*% a           20   0.072    1.674     0.071    0.000          0         0
3     matrixXvectorParallel(M, a)           20   0.140    3.256     0.746    0.346          0         0
5 matrixXvectorRcppParallel(M, a)           20   0.343    7.977     2.272    0.175          0         0

options(matprod ="blas")

options(matprod="blas")

options(matprod = "blas")

res<-benchmark(M%*%a,matrixXvector2(M,as.matrix(a)),matrixXvectorParallel(M,a),matrixXvectorParallel2(M,a),matrixXvectorRcppParallel(M,a),order="relative",replications = 20)
res
                             test replications elapsed relative user.self sys.self user.child sys.child
1                         M %*% a           20   0.021    1.000     0.093    0.054          0         0
2 matrixXvector2(M, as.matrix(a))           20   0.092    4.381     0.177    0.464          0         0
5 matrixXvectorRcppParallel(M, a)           20   0.328   15.619     2.143    0.109          0         0
4    matrixXvectorParallel2(M, a)           20   0.438   20.857     3.036    0.000          0         0
3     matrixXvectorParallel(M, a)           20   0.546   26.000     3.667    0.127          0         0

Rcpp并行或openmp用于矩阵向量乘积 [英] Rcpp Parallel or openmp for matrixvector product

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

Rcpp并行或openmp用于矩阵向量乘积 [英] Rcpp Parallel or openmp for matrixvector product

问题描述

推荐答案

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭