为什么我的OpenMP C++代码比串行代码慢? [英] why my OpenMP C++ code is slower than a serial code?

查看:195
本文介绍了为什么我的OpenMP C ++代码比串行代码慢?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

  #include< iostream> 
#include< iomanip>
#include< fstream>
#include< sstream>
#include< string>
#include< stdio.h>
#include< stdlib.h>
#include< math.h>
#include< omp.h>
using namespace std;


void output(float a [],float X [],float Y [],int I,int J)
{
ofstream ft;
int i;

ft.open(flow.dat);
ft<<variables = \x \,\y \,\a \<<\\\

< <zone f = point<<\ n
<<I =<<<<,J =<< J< \\\

<< endl;

for(int i = 0; i {
ft<< setiosflags(ios :: scientific)
< ; X [i] << }

ft.close();

}

void set(float a [],float X [],float Y [],int I,int J,float hx,float hy)
{
// Fills node coordinates and the initial field for an I x J row-major
// grid: X,Y hold positions (spacing hx,hy); a is 0 everywhere except
// 1.0 on the top boundary row (j == J-1).
// (Reconstructed: the scraped copy had the loop headers fused into the
// signature line.)
for(int j=0;j<J;j++)
  for(int i=0;i<I;i++)
    {
      int iC=j*I+i;
      X[iC]=i*hx;
      Y[iC]=j*hy;
      a[iC]=0.0;
      if(j==J-1) a[iC]=1.0;
    }
}

void difference_serial(float a [],int I,int J,const float hx,const float hy)
{
// One in-place sweep of the 5-point finite-difference Laplace stencil
// over the interior nodes; boundary rows/columns are left untouched.
// Because the update is in place, left/lower neighbors already hold
// this sweep's values (Gauss-Seidel-style iteration).
// (Reconstructed: the scraped copy had truncated loop headers.)
const float aC=(hx*hx+hy*hy)*2;
const float aX=hy*hy;
const float aY=hx*hx;
for(int j=1;j<J-1;j++)
  for(int i=1;i<I-1;i++)
    {
      int iC=j*I+i;
      int iL=iC-1;
      int iR=iC+1;
      int iU=iC+I;
      int iD=iC-I;
      a[iC]=(aX*(a[iL]+a[iR])+aY*(a[iU]+a[iD]))/aC;
    }
}

void difference_omp(float a [],int I,int J,const float hx,const float hy)
{
// OpenMP version of difference_serial: the outer (row) loop is split
// across threads. This is the questioner's original form, kept as-is
// (schedule(dynamic), hoisted private variables) because the answer
// below discusses exactly these choices.
// NOTE(review): the update is in place, so a thread reads neighbor rows
// (iU/iD) that other threads may be rewriting concurrently - a data
// race; parallel results need not match the serial sweep.
// (Reconstructed: the scraped copy had a garbled loop condition and a
// truncated assignment.)
const float aC=(hx*hx+hy*hy)*2;
const float aX=hy*hy;
const float aY=hx*hx;

int i,j,iC,iL,iR,iU,iD;
#pragma omp parallel for private(i,j,iC,iL,iR,iU,iD) shared(a,I,J) schedule(dynamic)
for( j=1;j<J-1;j++)
  for( i=1;i<I-1;i++)
    {
      iC=j*I+i;
      iL=iC-1;
      iR=iC+1;
      iU=iC+I;
      iD=iC-I;
      a[iC]=(aX*(a[iL]+a[iR])+aY*(a[iU]+a[iD]))/aC;
    }
}

int main()
{
const int I = 129;
const int J = 129;
const int N = I * J;
const float hx = 1.0 /(I-1);
const float hy = 1.0 /(J-1);

float * a = new float [N];
float * X = new float [N];
float * Y = new float [N];

//设置网格和流动
set(a,X,Y,I,J,hx,hy);

// iteation
clock_t start = clock();
for(int it = 0; it< 10000; it ++)
difference_serial(a,I,J,hx,hy)
clock_t end = clock();
printf(Serial time =%f \\\
,(float)(end-start)/ CLOCKS_PER_SEC);


set(a,X,Y,I,J,hx,hy);
clock_t start2 = clock();
for(int it2 = 0; it2< 10000; it2 ++)
difference_omp(a,I,J,hx,hy)
clock_t end2 = clock();
printf(Omp time =%f \\\
,(float)(end2-start2)/ CLOCKS_PER_SEC);

// output
输出(a,X,Y,I,J);

//可用内存
delete [] a;
delete [] X;
delete [] Y;
}

我写了一段代码来解决一个非常简单的拉普拉斯方程。尝试比较序列代码和OpenMP代码



我试着用
编译代码g ++ tmp.cpp -fopenmp



并得到非常奇怪的结果
输出:
序列时间= 1.620000
Omp时间= 9.820000



有人可以帮我找出这背后的原因,以及如何更正OpenMP代码。

解决方案

我遇到了一些有趣的结果。

  luk32:〜/ projects / tests $ g ++ -fopenmp -lgomp ./laplace.cpp 
luk32: 〜/ projects / tests $ ./a.out
Omp time = 13.000000
Serial time = 3.000000
luk32:〜/ projects / tests $ g ++ -O3 -fopenmp -lgomp ./laplace。 cpp
luk32:〜/ projects / tests $ ./a.out
Omp time = 31.000000
Serial time = 1.000000

所以使用 O3 ,OpenMP的时间加长,串行版本的时间减少。我的猜测是问题实例是如此之小,以致于调用并行区域的实际开销在这里显现。



您正在尝试并行化的内容，在您的电脑上每次调用只花费 1.5秒/10k = 0.15 毫秒。初始化线程池和任务调度本身是有开销的，尤其是使用 schedule(dynamic) 时。



我会尝试做一些测试来确认。不确定随意增大 I 和 J 是否合适。



测试后:



确定我切换 J = I = 10240; 并设置 for(int it = 0; it <50; it ++)。我还使用 omp_get_wtime()进行时间测量。



以下是结果:

 序列时间= 58.982189 
Omp时间= 9.158118

测试是在一台 6 物理核心 / 12 逻辑核心的机器上进行的。现在的结果符合预期。你的示例问题太小，OpenMP 无法发挥效率，以至于开销比计算本身花的时间还长。



Diff:

  luk32:〜/ projects / tests $ diff laplace.orig.cpp laplace.cpp 
88 ,89c88,89
< const int I = 129;
< const int J = 129;
---
> const int I = 10000;
> const int J = 10000;
102,103c102,103
< clock_t start = clock();
< for(int it = 0; it< 10000; it ++)
---
> double start = omp_get_wtime();
> for(int it = 0; it <50; it ++)
105,106c105,106
< clock_t end = clock();
< printf(Serial time =%f \\\
,(float)(end-start)/ CLOCKS_PER_SEC);
---
> double end = omp_get_wtime();
> printf(Serial time =%f \\\
,(float)(end-start));
110,111c110,111
< clock_t start2 = clock();
< for(int it2 = 0; it2< 10000; it2 ++)
---
> double start2 = omp_get_wtime();
> for(int it2 = 0; it2< 50; it2 ++)
113,114c113,114
< clock_t end2 = clock();
< printf(Omp time =%f \\\
,(float)(end2-start2)/ CLOCKS_PER_SEC);
-
> double end2 = omp_get_wtime();
> printf(Omp time =%f \\\
,(float)(end2-start2));

编辑：我把主要问题加粗了，这样任何看到这篇文章的人都会自动注意到它。

#include <iostream>
#include <iomanip>
#include <fstream> 
#include <sstream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
using namespace std;


void output(float a[], float X[], float Y[], int I, int J)
{
  // Writes the field `a` on the I x J grid to "flow.dat" in Tecplot
  // point format: a header, then one "x y a" line per node.
  // Fix: removed the unused file-scope-of-function `int i;` that was
  // shadowed by the loop variable below.
  ofstream ft;

  ft.open("flow.dat");
  ft<<"variables=\"x\",\"y\",\"a\""<<"\n"
    <<"zone f=point"<<"\n"
    <<"I="<<I<<",J="<<J<<"\n"
    <<endl;

  for(int i=0;i<I*J;i++)
    {
    ft<<setiosflags(ios::scientific)
      <<X[i]<<" "<<Y[i]<<" "<<a[i]<<endl;
    }

  ft.close();

}

void set(float a[], float X[], float Y[], int I, int J, float hx, float hy)
{
  // Initialize an I x J row-major grid: X,Y hold node coordinates with
  // spacing hx,hy; the field a is 0 everywhere and 1.0 on the top
  // boundary row (row index J-1).
  const int total = I * J;
  for (int idx = 0; idx < total; idx++)
    {
      const int col = idx % I;   // i: position within the row
      const int row = idx / I;   // j: row index
      X[idx] = col * hx;
      Y[idx] = row * hy;
      a[idx] = (row == J - 1) ? 1.0f : 0.0f;
    }
}

void difference_serial(float a[],  int I, int J, const float hx, const float hy)
{
  // One in-place sweep of the 5-point finite-difference Laplace stencil
  // over the interior nodes; boundary rows/columns are never written.
  // Because the sweep updates `a` in place, left/lower neighbors already
  // carry this sweep's values (Gauss-Seidel-style iteration).
  const float wX = hy * hy;          // weight of the left/right neighbors
  const float wY = hx * hx;          // weight of the up/down neighbors
  const float wC = (wX + wY) * 2;    // center coefficient

  for (int j = 1; j + 1 < J; j++)
    {
      const int rowBase = j * I;
      for (int i = 1; i + 1 < I; i++)
        {
          const int c = rowBase + i;
          const float horiz = a[c - 1] + a[c + 1];
          const float vert  = a[c + I] + a[c - I];
          a[c] = (wX * horiz + wY * vert) / wC;
        }
    }
}

void difference_omp(float a[],  int I, int J, const float hx, const float hy)
{
  // Parallel version of difference_serial: one in-place 5-point stencil
  // sweep over the interior nodes, with the rows split across OpenMP
  // threads.
  const float aC=(hx*hx+hy*hy)*2;
  const float aX=hy*hy;
  const float aY=hx*hx;

  // Fixes vs. the original:
  //  - schedule(static): every row costs the same, so static chunking
  //    avoids the per-chunk bookkeeping overhead of schedule(dynamic),
  //    which dominated on this tiny (129x129) problem.
  //  - loop variables declared in the loops, so they are private by the
  //    language rules; no error-prone private(...) list needed.
  // NOTE(review): the update is in place, so a thread reads neighbor
  // rows (iC+I, iC-I) that adjacent threads may be rewriting
  // concurrently - a data race; parallel results will generally differ
  // from the serial Gauss-Seidel sweep. A race-free version needs a
  // second buffer (Jacobi) or a red-black ordering.
#pragma omp parallel for schedule(static)
  for (int j = 1; j < J - 1; j++)
    for (int i = 1; i < I - 1; i++)
      {
        const int iC = j * I + i;
        a[iC] = (aX * (a[iC-1] + a[iC+1]) + aY * (a[iC+I] + a[iC-I])) / aC;
      }
}

int main()
{
  // Driver: builds a 129x129 grid, times 10000 serial sweeps and 10000
  // OpenMP sweeps, writes the final field to flow.dat, frees the arrays.
  const int I=129;
  const int J=129;
  const int N=I*J;
  const float hx=1.0/(I-1);
  const float hy=1.0/(J-1);

  float *a=new float[N];
  float *X=new float[N];
  float *Y=new float[N];

  //set the grid and flow
  set(a,X,Y,I,J,hx,hy);

  // Fix: time with omp_get_wtime() (wall clock), not clock().  clock()
  // accumulates CPU time over *all* threads, so it inflated the OpenMP
  // measurement - the root cause of the "OpenMP is 6x slower" reading.
  double start=omp_get_wtime();
  for(int it=0;it<10000;it++)
    difference_serial(a,I,J,hx,hy);
  double end=omp_get_wtime();
  printf("Serial time=%f\n",(float)(end-start));


  set(a,X,Y,I,J,hx,hy);
  double start2=omp_get_wtime();
  for(int it2=0;it2<10000;it2++)
    difference_omp(a,I,J,hx,hy);
  double end2=omp_get_wtime();
  printf("Omp time=%f\n",(float)(end2-start2));

  //output
  output(a,X,Y,I,J);

  //free memory
  delete[] a;
  delete[] X;
  delete[] Y;
}

I write a piece of code to solve a very simple Laplace equation in two dimensions. Try to compare the serial code and OpenMP code

I tried to compile the code with g++ tmp.cpp -fopenmp

and get the very strange result output: Serial time=1.620000 Omp time=9.820000

Is there anyone can help me to figure out what's the reason behind this and how to correct the OpenMP code.

解决方案

I ran into funny results.

luk32:~/projects/tests$ g++ -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=13.000000
Serial time=3.000000
luk32:~/projects/tests$ g++ -O3 -fopenmp -lgomp ./laplace.cpp 
luk32:~/projects/tests$ ./a.out 
Omp time=31.000000
Serial time=1.000000

So with O3 the time worsened for OpenMP and dropped forthe serial version. My guess is that the problem instance is so small that the actual overhead from invoking parallel region is manifesting here.

You are trying to parallelize something that is taking 1.5s / 10k = 0.15 millisecond on your PC. Initalizing thread pool and scheduling has its overhead especially with schedule(dynamic)

I will try to do some testing to confirm. Not sure if it is legal to randomly bump I and J.

After tests:

OK I switched J=I=10240; and set up for(int it=0;it<50;it++). I also used omp_get_wtime() for time measurments. Below is full diff file.

Here are the results:

Serial time=58.982189
Omp time=9.158118

It was perfromed on a 6-phys/12-logical core machine. Now the results are as expected. Your example problem was way too small for OpenMP to be efficient up to the point the overhead took longer than the calculations.

Diff:

luk32:~/projects/tests$ diff laplace.orig.cpp laplace.cpp
88,89c88,89
<   const int I=129;
<   const int J=129;
---
>   const int I=10000;
>   const int J=10000;
102,103c102,103
<   clock_t start=clock();
<   for(int it=0;it<10000;it++)
---
>   double start=omp_get_wtime();
>   for(int it=0;it<50;it++)
105,106c105,106
<   clock_t end=clock();
<   printf("Serial time=%f\n",(float)(end-start)/CLOCKS_PER_SEC);
---
>   double end=omp_get_wtime();
>   printf("Serial time=%f\n",(float)(end-start));
110,111c110,111
<   clock_t start2=clock();
<   for(int it2=0;it2<10000;it2++)
---
>   double start2=omp_get_wtime();
>   for(int it2=0;it2<50;it2++)
113,114c113,114
<   clock_t end2=clock();
<   printf("Omp time=%f\n",(float)(end2-start2)/CLOCKS_PER_SEC);
---
>   double end2=omp_get_wtime();
>   printf("Omp time=%f\n",(float)(end2-start2));

EDIT: I just bolded the main problem so anyone who comes across this will focus automatically on it.

这篇关于为什么我的OpenMP C ++代码比串行代码慢?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆