gcc auto-vectorisation(未处理的data-ref) [英] gcc auto-vectorisation (unhandled data-ref)

查看:293
本文介绍了gcc auto-vectorisation(未处理的data-ref)的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我不明白为什么这样的代码没有使用gcc 4.4.6进行向量化

  int MyFunc(const float * pfTab,对于(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + pfTab [iIndex)float * pfResult,int iSize,int iIndex)
{
]。
}

注意:未矢量化:未处理的数据ref

然而,如果我写下面的代码:

  int MyFunc(const float * pfTab,float * pfResult,int iSize,int iIndex )
{
float fTab = pfTab [iIndex];
for(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + fTab;
}

gcc成功自动矢量化这个循环

如果我添加omp指令

  int MyFunc(const float * pfTab,float * pfResult,int iSize, int iIndex)
{
float fTab = pfTab [iIndex];
#pragma omp parallel for
for(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + fTab;
}

我有以下错误未矢量化:unhandled data-ref



第二个问题:

请问第一个代码和第三个代码是不是自动矢量化的? b数学操作数似乎不是矢量化的(exp,log等等),例如这个代码

  for(int i = 0; i< iSize; i ++)
pfResult [i] = exp(pfResult [i]);

不是矢量化的。这是由于我的gcc版本?



编辑
与新版本的gcc 4.8.1和openMP 2011(回声| cpp -fopenmp -dM | grep -i open)
i对于所有类型的循环甚至基本上都有以下错误:

  for(iGID = 0; iGID< iSize; iGID ++)
{
pfResult [iGID] = fValue;
}


注意:不能连续访问* _144 = 5.0e-1;
注意:无法SLP基本块。
注意:未矢量化:未能在基本块中找到SLP机会。

Edit2:

 #包括< stdio.h中> 
#include< sys / time.h>
#include< string.h>
#include< math.h>
#include< stdlib.h>
#include

int main()
{
int szGlobalWorkSize = 131072;
int iGID = 0;
int j = 0;
omp_set_dynamic(0);
//预热
#if WARMUP
#pragma omp parallel
{
#pragma omp master
{
printf(%d threads \\\
,omp_get_num_threads());
}
}
#endif
printf(Pagesize =%d\\\
,getpagesize());
float * pfResult =(float *)malloc(szGlobalWorkSize * 100 * sizeof(float));
float fValue = 0.5f;
struct timeval tim;
gettimeofday(& tim,NULL);
double tLaunch1 = tim.tv_sec +(tim.tv_usec / 1000000.0);
double time = omp_get_wtime();
int iChunk = getpagesize();
int iSize =((int)szGlobalWorkSize * 100)/ iChunk;
//#pragma omp parallel for
for(iGID = 0; iGID< iSize; iGID ++)
{
pfResult [iGID] = fValue;
}
time = omp_get_wtime() - time;
gettimeofday(& tim,NULL);
double tLaunch2 = tim.tv_sec +(tim.tv_usec / 1000000.0);
printf(%。6lf Time1 \\\
,tLaunch2-tLaunch1);
printf(%。6lf Time2 \\\
,time);
}

结果

  #define _OPENMP 201107 
gcc(GCC)4.8.2 20140120(Red Hat 4.8.2-15)

gcc -march = native -fopenmp -O3 -ftree-vectorizer-verbose = 2 test.c -lm

很多

 注意:无法SLP基本块。 
注意:未矢量化:未能在基本块中找到SLP机会。
并注意:不能连续访问* _144 = 5.0e-1;

谢谢

解决方案 GCC无法引导循环的第一个版本,因为它无法证明 pfTab [iIndex] 未被包含在由 pfResult [0] ... pfResult [iSize-1] (指针别名)。的确,如果 pfTab [iIndex] 在该内存中的某处,那么它的值必须被循环体中的赋值所覆盖,并且新值必须用于迭代跟随。您应该使用 restrict 关键字来提示编译器,这永远不会发生,然后它应该高兴地引导您的代码:

  $ cat foo.c 
int MyFunc(const float * restrict pfTab,float * restrict pfResult,
int iSize,int iIndex)
{
for(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + pfTab [iIndex];
}
$ gcc -v
...
gcc版本4.6.1(GCC)
$ gcc -std = c99 -O3 -march = native -ftree -vectorizer -verbose = 2 -c foo.c
foo.c:3:note:LOOP VECTORIZED。
foo.c:1:注意:函数中的向量化1循环。

第二个版本矢量化,因为该值将传输到具有自动存储持续时间的变量。这里的一般假设是 pfResult 不会覆盖存储 fTab 的堆栈内存(粗略地读取C99语言规范并没有说明这个假设是否微弱,或者标准中允许的假设)。

由于OpenMP的方式在GCC中实施。它为并行区域使用代码大纲。

  int MyFunc(const float * pfTab,float * pfResult,int iSize,int iIndex )
{
float fTab = pfTab [iIndex];
#pragma omp parallel for
for(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + fTab;

code $


有效地变成:

$ b pre> struct omp_data_s
{
float * pfResult;
int iSize;
float * fTab;
};

int MyFunc(const float * pfTab,float * pfResult,int iSize,int iIndex)
{
float fTab = pfTab [iIndex];
struct omp_data_s omp_data_o;

omp_data_o.pfResult = pfResult;
omp_data_o.iSize = iSize;
omp_data_o.fTab = fTab;

GOMP_parallel_start(MyFunc_omp_fn0,& omp_data_o,0);
MyFunc._omp_fn.0(& omp_data_o);
GOMP_parallel_end();
pfResult = omp_data_o.pfResult;
iSize = omp_data_o.iSize;
fTab = omp_data_o.fTab;
}

void MyFunc_omp_fn0(struct omp_data_s * omp_data_i)
{
int start = ...; //计算当前线程的开始迭代
int end = ...; //计算当前线程的结束迭代

for(int i = start; i< end; i ++)
omp_data_i-> pfResult [i] = omp_data_i-> pfResult [i ] + omp_data_i-> fTab;

MyFunc_omp_fn0 包含概述功能代码。编译器无法证明 omp_data_i-> pfResult 没有指向内存,即别名 omp_data_i ,特别是它的成员 fTab



为了引导该循环,必须使 fTab firstprivate 。这会在代码中将它变成一个自动变量,并且相当于您的第二种情况:

  $ cat foo。 c 
int MyFunc(const float * pfTab,float * pfResult,int iSize,int iIndex)
{
float fTab = pfTab [iIndex];
#pragma omp parallel for firstprivate(fTab)
for(int i = 0; i< iSize; i ++)
pfResult [i] = pfResult [i] + fTab;
}
$ gcc -std = c99 -fopenmp -O3 -march = native -ftree -vectorizer -verbose = 2 -c foo.c
foo.c:6:note:LOOP VECTORIZED 。
foo.c:4:注意:在函数中矢量化1循环。


I do not understand why such code is not vectorized with gcc 4.4.6

int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
  for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + pfTab[iIndex];
}

 note: not vectorized: unhandled data-ref

However, if I write the following code

   int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
  float fTab =  pfTab[iIndex];
  for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + fTab;
}

gcc succeeds auto-vectorize this loop

if I add omp directive

   int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
  float fTab =  pfTab[iIndex];
  #pragma omp parallel for
  for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + fTab;
}

i have the following error not vectorized: unhandled data-ref

Could you please help me why the first code and third code is not auto-vectorized ?

Second question: math operand seems to be not vectorized (exp, log , etc...), this code for example

for (int i = 0; i < iSize; i++)
         pfResult[i] = exp(pfResult[i]);

is not vectorized. It is due to my version of gcc ?

Edit: with new version of gcc 4.8.1 and openMP 2011 (echo |cpp -fopenmp -dM |grep -i open) i have the following error for all kind of loop even basically

   for (iGID = 0; iGID < iSize; iGID++)
        {
             pfResult[iGID] = fValue;
        }


note: not consecutive access *_144 = 5.0e-1;
note: Failed to SLP the basic block.
note: not vectorized: failed to find SLP opportunities in basic block.

Edit2:

#include<stdio.h>
#include<sys/time.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>

int main()
{
        int szGlobalWorkSize = 131072;
        int iGID = 0;
        int j = 0;
        omp_set_dynamic(0);
        // warmup
        #if WARMUP
        #pragma omp parallel
        {
        #pragma omp master
        {
        printf("%d threads\n", omp_get_num_threads());
        }
        }
        #endif
        printf("Pagesize=%d\n", getpagesize());
        float *pfResult = (float *)malloc(szGlobalWorkSize * 100* sizeof(float));
        float fValue = 0.5f;
        struct timeval tim;
        gettimeofday(&tim, NULL);
        double tLaunch1=tim.tv_sec+(tim.tv_usec/1000000.0);
        double time = omp_get_wtime();
        int iChunk = getpagesize();
        int iSize = ((int)szGlobalWorkSize * 100) / iChunk;
        //#pragma omp parallel for
        for (iGID = 0; iGID < iSize; iGID++)
        {
             pfResult[iGID] = fValue;
        }
        time = omp_get_wtime() - time;
        gettimeofday(&tim, NULL);
        double tLaunch2=tim.tv_sec+(tim.tv_usec/1000000.0);
        printf("%.6lf Time1\n", tLaunch2-tLaunch1);
        printf("%.6lf Time2\n", time);
}

result with

#define _OPENMP 201107
gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-15)

gcc -march=native -fopenmp -O3 -ftree-vectorizer-verbose=2 test.c -lm

lot of

note: Failed to SLP the basic block.
note: not vectorized: failed to find SLP opportunities in basic block.
and note: not consecutive access *_144 = 5.0e-1;

Thanks

解决方案

GCC cannot vectorise the first version of your loop because it cannot prove that pfTab[iIndex] is not contained somewhere within the memory spanned by pfResult[0] ... pfResult[iSize-1] (pointer aliasing). Indeed, if pfTab[iIndex] is somewhere within that memory, then its value must be overwritten by the assignment in the loop body and the new value must be used in the iterations to follow. You should use the restrict keyword to hint the compiler that this could never happen and then it should happily vectorise your code:

$ cat foo.c
int MyFunc(const float *restrict pfTab, float *restrict pfResult,
           int iSize, int iIndex)
{
   for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + pfTab[iIndex];
}
$ gcc -v
...
gcc version 4.6.1 (GCC)
$ gcc -std=c99 -O3 -march=native -ftree-vectorizer-verbose=2 -c foo.c
foo.c:3: note: LOOP VECTORIZED.
foo.c:1: note: vectorized 1 loops in function.

The second version vectorises since the value is transferred to a variable with an automatic storage duration. The general assumption here is that pfResult does not span over the stack memory where fTab is stored (a cursory read through the C99 language specification doesn't make it clear if that assumption is weak or something in the standard allows it).

The OpenMP version does not vectorise because of the way OpenMP is implemented in GCC. It uses code outlining for the parallel regions.

int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
  float fTab =  pfTab[iIndex];
  #pragma omp parallel for
  for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + fTab;
}

effectively becomes:

struct omp_data_s
{
  float *pfResult;
  int iSize;
  float *fTab;
};

int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
  float fTab =  pfTab[iIndex];
  struct omp_data_s omp_data_o;

  omp_data_o.pfResult = pfResult;
  omp_data_o.iSize = iSize;
  omp_data_o.fTab = fTab;

  GOMP_parallel_start (MyFunc_omp_fn0, &omp_data_o, 0);
  MyFunc._omp_fn.0 (&omp_data_o);
  GOMP_parallel_end ();
  pfResult = omp_data_o.pfResult;
  iSize = omp_data_o.iSize;
  fTab = omp_data_o.fTab;
}

void MyFunc_omp_fn0 (struct omp_data_s *omp_data_i)
{
  int start = ...; // compute starting iteration for current thread
  int end = ...; // compute ending iteration for current thread

  for (int i = start; i < end; i++)
    omp_data_i->pfResult[i] = omp_data_i->pfResult[i] + omp_data_i->fTab;
}

MyFunc_omp_fn0 contains the outlined function code. The compiler is not able to prove that omp_data_i->pfResult does not point to memory that aliases omp_data_i and specifically its member fTab.

In order to vectorise that loop, you have to make fTab firstprivate. This will turn it into an automatic variable in the outlined code and that will be equivalent to your second case:

$ cat foo.c
int MyFunc(const float *pfTab, float *pfResult, int iSize, int iIndex)
{
   float fTab = pfTab[iIndex];
   #pragma omp parallel for firstprivate(fTab)
   for (int i = 0; i < iSize; i++)
     pfResult[i] = pfResult[i] + fTab;
}
$ gcc -std=c99 -fopenmp -O3 -march=native -ftree-vectorizer-verbose=2 -c foo.c
foo.c:6: note: LOOP VECTORIZED.
foo.c:4: note: vectorized 1 loops in function.

这篇关于gcc auto-vectorisation(未处理的data-ref)的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆