RBM no improvement with OpenACC on the code yet


Question

The RBM algorithm is an open-source algorithm; the source code is available here: https://github.com/yusugomori/DeepLearning/tree/master/cpp

I tried to get an improvement with OpenACC in several ways, but the sequential code is still faster. So can you tell me what should be done (which parts need improvement) to get a significant speed-up?

#include <iostream>
#include <stdio.h>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;


RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
        N = size;
        n_visible = n_v;
        n_hidden = n_h;

#pragma acc enter data copyin ( this )

//#pragma acc enter data copy ( W[0:n_hidden][0:n_visible] )
        if(w == NULL) {
                W = new double*[n_hidden];
                for(int i=0; i<n_hidden; i++) W[i] = new double[n_visible];
                double a = 1.0 / n_visible;

                for(int i=0; i<n_hidden; i++) {
                        for(int j=0; j<n_visible; j++) {
                                W[i][j] = uniform(-a, a);
                        }
                }
        } else {
                W = w;
        }

        if(hb == NULL) {
                hbias = new double[n_hidden];
                for(int i=0; i<n_hidden; i++) hbias[i] = 0;
        } else {
                hbias = hb;
        }

        if(vb == NULL) {
                vbias = new double[n_visible];
                for(int i=0; i<n_visible; i++) vbias[i] = 0;
        } else {
                vbias = vb;
        }
}

RBM::~RBM() {

#pragma acc exit data delete ( W[0:n_hidden][0:n_visible],this )

        for(int i=0; i<n_hidden; i++) delete[] W[i];
        delete[] W;
        delete[] hbias;
        delete[] vbias;
}

void RBM::contrastive_divergence(int *input, double lr, int k) {
        double *ph_mean = new double[n_hidden];
        int *ph_sample = new int[n_hidden];
        double *nv_means = new double[n_visible];
        int *nv_samples = new int[n_visible];
        double *nh_means = new double[n_hidden];
        int *nh_samples = new int[n_hidden];

        /* CD-k */
        sample_h_given_v(input, ph_mean, ph_sample);

        for(int step=0; step<k; step++) {
                if(step == 0) {
                        gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
                } else {
                        gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
                }
        }

        for(int i=0; i<n_hidden; i++) {
                for(int j=0; j<n_visible; j++) {
                        // W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                        W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                }
                hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
        }

        for(int i=0; i<n_visible; i++) {
                vbias[i] += lr * (input[i] - nv_samples[i]) / N;
        }

        delete[] ph_mean;
        delete[] ph_sample;
        delete[] nv_means;
        delete[] nv_samples;
        delete[] nh_means;
        delete[] nh_samples;
}

void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {
        for(int i=0; i<n_hidden; i++) {
                mean[i] = propup(v0_sample, W[i], hbias[i]);
                sample[i] = binomial(1, mean[i]);
        }
}

void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
        for(int i=0; i<n_visible; i++) {
                mean[i] = propdown(h0_sample, i, vbias[i]);
                sample[i] = binomial(1, mean[i]);
        }
}

double RBM::propup(int *v, double *w, double b) {

        double pre_sigmoid_activation = 0.0;
#pragma acc enter data present ( this )
#pragma acc data copyin(v[0:n_visible],w[0:n_visible])

#pragma acc parallel
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_visible; j++) {
                        pre_sigmoid_activation += w[j] * v[j];
                }
        }

        pre_sigmoid_activation += b;
        return sigmoid(pre_sigmoid_activation);
}

double RBM::propdown(int *h, int i, double b) {

        double pre_sigmoid_activation = 0.0;

#pragma acc enter data present ( this )//,W[0:n_hidden][0:n_visible] )
#pragma acc enter data copyin ( W[0:n_hidden][0:n_visible] )
#pragma acc data copyin(h[0:n_hidden])

#pragma acc parallel
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_hidden; j++) {
                        pre_sigmoid_activation += W[j][i] * h[j];
                }
        }

        pre_sigmoid_activation += b;

        return sigmoid(pre_sigmoid_activation);
}

void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, \
                double *nh_means, int *nh_samples) {
        sample_v_given_h(h0_sample, nv_means, nv_samples);
        sample_h_given_v(nv_samples, nh_means, nh_samples);
}

void RBM::reconstruct(int *v, double *reconstructed_v) {
        double *h = new double[n_hidden];
        double pre_sigmoid_activation;

        for(int i=0; i<n_hidden; i++) {
                h[i] = propup(v, W[i], hbias[i]);
        }

        for(int i=0; i<n_visible; i++) {
                pre_sigmoid_activation = 0.0;
                for(int j=0; j<n_hidden; j++) {
                        pre_sigmoid_activation += W[j][i] * h[j];
                }
                pre_sigmoid_activation += vbias[i];

                reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
        }

        delete[] h;
}

//----------------------------------------------------The main
void test_rbm() {

        srand(0);

        double learning_rate = 0.1;
        int training_epochs = 1000;
        int k = 1;

        int train_N = 6;
        int test_N = 2;
        int n_visible = 6;
        int n_hidden = 3;

        // training data
        int train_X[6][6] = {
                {1, 1, 1, 0, 0, 0},
                {1, 0, 1, 0, 0, 0},
                {1, 1, 1, 0, 0, 0},
                {0, 0, 1, 1, 1, 0},
                {0, 0, 1, 0, 1, 0},
                {0, 0, 1, 1, 1, 0}
        };

        // construct RBM
        RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);

        // train
        for(int epoch=0; epoch<training_epochs; epoch++) {
                for(int i=0; i<train_N; i++) {
                        rbm.contrastive_divergence(train_X[i], learning_rate, k);
                }
        }

        // test data
        int test_X[2][6] = {
                {1, 1, 0, 0, 0, 0},
                {0, 0, 0, 1, 1, 0}
        };

        double reconstructed_X[2][6];

        // test
        for(int i=0; i<test_N; i++) {
                rbm.reconstruct(test_X[i], reconstructed_X[i]);
                for(int j=0; j<n_visible; j++) {
                        printf("%.5f ", reconstructed_X[i][j]);
                }
                cout << endl;
        }
}

int main() {
        test_rbm();
        return 0;
}

Answer

You have a few errors which were giving you wrong answers. I've corrected these below. The key changes: the constructor now issues a single enter data copyin of this, W, hbias, and vbias after they are allocated (instead of copying only this and re-copying W inside propdown on every call), the parallel regions use present clauses, and an update device directive synchronizes W, hbias, and vbias after the host-side updates in contrastive_divergence.

As for performance, you don't have enough parallelism to outperform running the code sequentially. The loops you're parallelizing have very little compute, use reductions, and are very small. To see a speed-up over the host, you'll need to use larger sizes (lengths of many thousands) and preferably push the gang level of the parallelism up to a higher loop. I tried this, but the binomial routine has a dependency (the call to rand) which prevents parallelization of the loops in sample_h_given_v and sample_v_given_h, as sketched below.
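To illustrate the "push the gang level higher" idea, here is a minimal sketch, not from the original answer, of what sample_h_given_v could look like if the rand() dependency were removed by precomputing the uniform random numbers on the host. The sample_h_given_v_parallel name and the uniforms parameter are hypothetical additions for illustration:

// Hypothetical sketch: uniforms[0:n_hidden] holds rand()/RAND_MAX values
// generated on the host, so the device loops never call rand().
void RBM::sample_h_given_v_parallel(int *v0_sample, double *mean,
                int *sample, double *uniforms) {
#pragma acc parallel loop gang present(W,hbias) \
        copyin(v0_sample[0:n_visible],uniforms[0:n_hidden]) \
        copyout(mean[0:n_hidden],sample[0:n_hidden])
        for(int i=0; i<n_hidden; i++) {
                double pre = 0.0;
#pragma acc loop vector reduction(+:pre)
                for(int j=0; j<n_visible; j++) {
                        pre += W[i][j] * v0_sample[j];  // dot product as the inner, vector-level loop
                }
                mean[i] = 1.0 / (1.0 + exp(-(pre + hbias[i])));  // sigmoid, inlined to avoid an acc routine
                sample[i] = (uniforms[i] < mean[i]) ? 1 : 0;     // binomial(1, p) with a precomputed uniform
        }
}

Even with this structure, sizes of n_hidden = 3 and n_visible = 6 are far too small to amortize a kernel launch; the gang-level loop only pays off at the many-thousands sizes mentioned above.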

#include <iostream>
#include <stdio.h>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;

RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
        N = size;
        n_visible = n_v;
        n_hidden = n_h;

        if(w == NULL) {
                W = new double*[n_hidden];
                for(int i=0; i<n_hidden; i++) W[i] = new double[n_visible];
                double a = 1.0 / n_visible;

                for(int i=0; i<n_hidden; i++) {
                        for(int j=0; j<n_visible; j++) {
                                W[i][j] = uniform(-a, a);
                        }
                }
        } else {
                W = w;
        }

        if(hb == NULL) {
                hbias = new double[n_hidden];
                for(int i=0; i<n_hidden; i++) hbias[i] = 0;
        } else {
                hbias = hb;
        }

        if(vb == NULL) {
                vbias = new double[n_visible];
                for(int i=0; i<n_visible; i++) vbias[i] = 0;
        } else {
                vbias = vb;
        }
#pragma acc enter data copyin (this,W[0:n_hidden][0:n_visible],hbias[0:n_hidden],vbias[0:n_visible])
}

RBM::~RBM() {

#pragma acc exit data delete ( hbias[0:n_hidden],vbias[0:n_visible],W[0:n_hidden][0:n_visible],this )
        for(int i=0; i<n_hidden; i++) delete[] W[i];
        delete[] W;
        delete[] hbias;
        delete[] vbias;
}

void RBM::contrastive_divergence(int *input, double lr, int k) {
        double *ph_mean = new double[n_hidden];
        int *ph_sample = new int[n_hidden];
        double *nv_means = new double[n_visible];
        int *nv_samples = new int[n_visible];
        double *nh_means = new double[n_hidden];
        int *nh_samples = new int[n_hidden];

        /* CD-k */
        sample_h_given_v(input, ph_mean, ph_sample);

        for(int step=0; step<k; step++) {
                if(step == 0) {
                        gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
                } else {
                        gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
                }
        }

        for(int i=0; i<n_hidden; i++) {
                for(int j=0; j<n_visible; j++) {
                        // W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                        W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                }
                hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
        }

        for(int i=0; i<n_visible; i++) {
                vbias[i] += lr * (input[i] - nv_samples[i]) / N;
        }
#pragma acc update device(vbias[0:n_visible],hbias[0:n_hidden],W[0:n_hidden][0:n_visible])

        delete[] ph_mean;
        delete[] ph_sample;
        delete[] nv_means;
        delete[] nv_samples;
        delete[] nh_means;
        delete[] nh_samples;
}

void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {

#pragma acc data copyin(v0_sample[0:n_visible])
        {
                for(int i=0; i<n_hidden; i++) {
                        mean[i] = propup(v0_sample, W[i], hbias[i]);
                        sample[i] = binomial(1, mean[i]);
                }
        }
}

void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
#pragma acc data copyin(h0_sample[0:n_hidden])
        {
                for(int i=0; i<n_visible; i++) {
                        mean[i] = propdown(h0_sample, i, vbias[i]);
                        sample[i] = binomial(1, mean[i]);
                }
        }
}

double RBM::propup(int *v, double *w, double b) {

        double pre_sigmoid_activation = 0.0;
#pragma acc parallel present(w,v)
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_visible; j++) {
                        pre_sigmoid_activation += w[j] * v[j];
                }
        }

        pre_sigmoid_activation += b;
        return sigmoid(pre_sigmoid_activation);
}
double RBM::propdown(int *h, int i, double b) {

        double pre_sigmoid_activation = 0.0;

#pragma acc parallel present(W,h)
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_hidden; j++) {
                        pre_sigmoid_activation += W[j][i] * h[j];
                }
        }

        pre_sigmoid_activation += b;

        return sigmoid(pre_sigmoid_activation);

}
void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, \
                double *nh_means, int *nh_samples) {
        sample_v_given_h(h0_sample, nv_means, nv_samples);
        sample_h_given_v(nv_samples, nh_means, nh_samples);
}

void RBM::reconstruct(int *v, double *reconstructed_v) {
        double *h = new double[n_hidden];
        double pre_sigmoid_activation;

#pragma acc data copyin(v[0:n_visible])
        {

                for(int i=0; i<n_hidden; i++) {
                        h[i] = propup(v, W[i], hbias[i]);
                }

                for(int i=0; i<n_visible; i++) {
                        pre_sigmoid_activation = 0.0;
                        for(int j=0; j<n_hidden; j++) {
                                pre_sigmoid_activation += W[j][i] * h[j];
                        }
                        pre_sigmoid_activation += vbias[i];

                        reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
                }
        }
        delete[] h;
}

//----------------------------------------------------The main
void test_rbm() {

        srand(0);
        double learning_rate = 0.1;
        int training_epochs = 1000;
        int k = 1;
        int train_N = 6;
        int test_N = 2;
        int n_visible = 6;
        int n_hidden = 3;

        // training data
        int train_X[6][6] = {
                {1, 1, 1, 0, 0, 0},
                {1, 0, 1, 0, 0, 0},
                {1, 1, 1, 0, 0, 0},
                {0, 0, 1, 1, 1, 0},
                {0, 0, 1, 0, 1, 0},
                {0, 0, 1, 1, 1, 0}
        };

        // construct RBM
        RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);

        // train
        for(int epoch=0; epoch<training_epochs; epoch++) {
                for(int i=0; i<train_N; i++) {
                        rbm.contrastive_divergence(train_X[i], learning_rate, k);
                }
        }

        // test data
        int test_X[2][6] = {
                {1, 1, 0, 0, 0, 0},
                {0, 0, 0, 1, 1, 0}
        };

        double reconstructed_X[2][6];

        // test
        for(int i=0; i<test_N; i++) {
                rbm.reconstruct(test_X[i], reconstructed_X[i]);
                for(int j=0; j<n_visible; j++) {
                        printf("%20.15f ", reconstructed_X[i][j]);
                }
                cout << endl;
        }
}

int main() {
        test_rbm();
        return 0;
}
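To confirm what the compiler actually parallelized, assuming the PGI/NVIDIA OpenACC compilers, you can build with accelerator feedback enabled, for example:

        pgc++ -acc -Minfo=accel RBM.cpp -o rbm

The -Minfo=accel report lists which loops became gang/vector kernels and where implicit data movement was generated, which makes the missing-parallelism problem described above easy to verify.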
