如何帮助gcc矢量化C代码 [英] How to help gcc vectorize C code

查看:209
本文介绍了如何帮助gcc矢量化C代码的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有以下C代码。第一部分是从复合数字矩阵中读入标准矩阵,称为 M 。有趣的部分是第二部分。

  #include< stdio.h> 
#include< complex.h>
#include< stdlib.h>
#include< assert.h>
#include< math.h>

int main(){
int n,m,c,d;
float re,im;

scanf(%d%d,& n,& m);
assert(n == m);
complex float M [n] [n]; (c = 0; c for(d = 0; d scanf(%f%fi)的

,& re,& im);
M [c] [d] = re + im * I; (c = 0; c for(d = 0; d }
}

$ b printf(%。2f%+。2fi,creal(M [c] [d]),cimag(M [c] [d])));
}
printf(\\\
);
}
/ *
例如:输入
2 3
1 + 2i 2 + 3i 74-4i
3 + 4i 4 + 5i -7- 8i
* /
/ *第2部分。M现在是一个n乘n的复数矩阵* /
int s = 1,i,j;
int * f = malloc(n * sizeof * f);
复数浮点数* delta = malloc(n * sizeof * delta);
complex float * v = malloc(n * sizeof * v);
复合浮点数p = 1,prod; (i = 0; i v [i] = 0;


for(j = 0; j v [i] + = M [j] [i];
}
p * = v [i];
f [i] = i;
delta [i] = 1;
}
j = 0; $ j $ b而(j prod = 1。
for(i = 0; i v [i] - = 2 * delta [j] * M [j] [i];
prod * = v [i];
}
delta [j] = -delta [j];
s = -s;
p + = s * prod;
f [0] = 0;
f [j] = f [j + 1];
f [j + 1] = j + 1;
j = f [0];
}
free(delta);
free(f);
free(v);
printf(%f + i%f \ n,creal(p / pow(2.,(n-1))),cimag(p / pow(2。,(n-1)) ));
返回0;
}

我使用 gcc -fopt-info-vec进行编译-all -O3 -ffast-math -march = bdver2 permanent-in-cc -lm 。这解释了为什么几乎没有任何循环被矢量化。



性能最重要的部分是第47--50行:

 <$对于(i = 0; i< n; i ++){
v [i] - = 2 * delta [j] * M [j] [i];
prod * = v [i];
}

gcc告诉我:

  permanent-in-cc:47:7:注意:在循环中使用了缩减。 
permanent-in-c.c:47:7:note:未知的def-use循环模式。
permanent-in-c.c:47:7:注意:在循环中使用了缩减。
permanent-in-c.c:47:7:note:未知的def-use循环模式。
permanent-in-c.c:47:7:note:不支持的模式。
permanent-in-c.c:47:7:note:not vectorized:不支持在stmt中使用。
permanent-in-c.c:47:7:注意:意外模式。
[...]
permanent-in-c.c:48:26:注意:SLP:step不会分割向量大小。
permanent-in-c.c:48:26:note:未知的访问对齐:IMAGPART_EXPR <* M.4_40 [j_202] {lb:0 sz:pretmp_291 * 4} [i_200]>
permanent-in-c.c:48:26:注意:SLP:step不会分割向量大小。
permanent-in-c.c:48:26:note:未知的访问对齐:REALPART_EXPR <* M.4_40 [j_202] {lb:0 sz:pretmp_291 * 4} [i_200]>
[...]
permanent-in-cc:48:26:note:构建SLP失败:在基本块SLP
permanent-in-cc中需要展开展开:48:26:note :无法SLP基本块。
permanent-in-c.c:48:26:注意:未矢量化:无法在基本块中找到SLP机会。




如何解决阻止这部分的问题是
向量化?






奇怪的是这部分是矢量化的,但我不确定为什么:

$ p $ 对于(j = 0; j v [i] + = M [ j]的[I];






gcc -fopt-info- vec-all -O3 -ffast-math -march = bdver2 permanent-in-cc -lm位于 https:// bpaste .net / show / 18ebc3d66a53

解决方案

我想我可能已经知道了。经过很多试验/错误,很明显,构建在矢量化优化中的gcc是一种硬编码,它不能正确理解复数。我在代码中做了一些修改,并让你的内部性能敏感的循环向量化,并由gcc输出确认(尽管我不确定期望的结果是否与计算结果相同)。虽然我的理解仅限于你想要的代码,但发现如果你分别计算真实和图像,它的工作将会很好。看看:

  float t_r = 0.0,t_im = 0.0; //(j  prod = 1; //两个新的临时对象

for(i = 0; i //填充从V减去的临时值以避免错误错误
t_r = creal(v [i]) - ( 2. * creal(delta [j])* creal(M [j] [i]));
t_im = cimag(v [i]) - (2. * cimag(delta [j])* cimag(M [j] [i]))* I;
// v [i] = 2. * delta [j] * M [j] [i];
v [i] = t_r + t_im; //真实和img
prod * = v [i];
}
delta [j] = -delta [j];
s = -s;
p + = s * prod;
f [0] = 0;
f [j] = f [j + 1];
f [j + 1] = j + 1;
j = f [0];
}


I have the following C code. The first part just reads in a matrix of complex numbers from standard in into matrix called M. The interesting part is the second part.

#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>

int main() {
    int n, m, c, d;
    float re, im;

    scanf("%d %d", &n, &m);
    assert(n==m);
    complex float M[n][n];

    for(c=0; c<n; c++) {
      for(d=0; d<n; d++) {
    scanf("%f%fi", &re, &im);
    M[c][d] = re + im * I;
      }
    }

    for(c=0; c<n; c++) {
      for(d=0; d<n; d++) {
        printf("%.2f%+.2fi ", creal(M[c][d]), cimag(M[c][d]));
      }
      printf("\n");
    }
/*
Example:input   
2 3
1+2i 2+3i 74-4i
3+4i 4+5i -7-8i
*/
    /* Part 2. M is now an n by n matrix of complex numbers */
    int s=1, i, j;
    int *f = malloc(n * sizeof *f);
    complex float *delta = malloc(n * sizeof *delta);
    complex float *v = malloc(n * sizeof *v);
    complex float p = 1, prod;

    for (i = 0; i < n; i++) {
      v[i] = 0;
      for (j = 0; j <n; j++) {
        v[i] += M[j][i];
      }
      p *= v[i];
      f[i] = i;
      delta[i] = 1;
    }
    j = 0;
    while (j < n-1) {
      prod = 1.;
      for (i = 0; i < n; i++) {
        v[i] -= 2.*delta[j]*M[j][i];
        prod *= v[i];
      }
      delta[j] = -delta[j];
      s = -s;            
      p += s*prod;
      f[0] = 0;
      f[j] = f[j+1];
      f[j+1] = j+1;
      j = f[0];
    }
    free(delta);
    free(f);
    free(v);
    printf("%f + i%f\n", creal(p/pow(2.,(n-1))), cimag(p/pow(2.,(n-1))));
    return 0;
}

I compile with gcc -fopt-info-vec-all -O3 -ffast-math -march=bdver2 permanent-in-c.c -lm. This explains to me why almost none of the loops are being vectorized.

The most important part for performance is lines 47--50 which are:

for (i = 0; i < n; i++) {
    v[i] -= 2.*delta[j]*M[j][i];
    prod *= v[i];
}

gcc tells me:

permanent-in-c.c:47:7: note: reduction used in loop.
permanent-in-c.c:47:7: note: Unknown def-use cycle pattern.
permanent-in-c.c:47:7: note: reduction used in loop.
permanent-in-c.c:47:7: note: Unknown def-use cycle pattern.
permanent-in-c.c:47:7: note: Unsupported pattern.
permanent-in-c.c:47:7: note: not vectorized: unsupported use in stmt.
permanent-in-c.c:47:7: note: unexpected pattern.
[...]
permanent-in-c.c:48:26: note: SLP: step doesn't divide the vector-size.
permanent-in-c.c:48:26: note: Unknown alignment for access: IMAGPART_EXPR <*M.4_40[j_202]{lb: 0 sz: pretmp_291 * 4}[i_200]>
permanent-in-c.c:48:26: note: SLP: step doesn't divide the vector-size.
permanent-in-c.c:48:26: note: Unknown alignment for access: REALPART_EXPR <*M.4_40[j_202]{lb: 0 sz: pretmp_291 * 4}[i_200]>
[...]
permanent-in-c.c:48:26: note: Build SLP failed: unrolling required in basic block SLP
permanent-in-c.c:48:26: note: Failed to SLP the basic block.
permanent-in-c.c:48:26: note: not vectorized: failed to find SLP opportunities in basic block.

How can I fix the problems that are stopping this part from being vectorized?


Curiously this part is vectorized but I am not sure why:

for (j = 0; j <n; j++) {
    v[i] += M[j][i];


The full output of gcc -fopt-info-vec-all -O3 -ffast-math -march=bdver2 permanent-in-c.c -lm is at https://bpaste.net/show/18ebc3d66a53.

解决方案

I think I might have figured it out. After a lot of trial/error, it became clear that gcc built in vectorization optimizations are sort of hard coded and it doesn't 'understand' complex numbers properly. I made some changes in the code and got your inner performance sensitive loop to vectorize, confirmed by gcc output (though I am not sure the desired outcome is computationally equivalent to what you want). While my understanding is limited to what you want the code to do, the finding is that it'll work fine if you compute real and imag separately. Have a look:

    float t_r = 0.0, t_im = 0.0; // two new temporaries  
    while (j < n-1) {
        prod = 1.;
        for (i = 0; i < n; i++) {
// fill the temps after subtraction from V to avoid stmt error
            t_r = creal (v[i]) - (2. * creal(delta[j]) * creal (M[j][i]));
            t_im = cimag(v[i]) - (2. * cimag(delta[j]) * cimag (M[j][i])) * I;
            //v[i] = 2.*delta[j]*M[j][i];
            v[i] = t_r + t_im; // sum of real and img
            prod *= v[i];
        }
        delta[j] = -delta[j];
        s = -s;            
        p += s*prod;
        f[0] = 0;
        f[j] = f[j+1];
        f[j+1] = j+1;
        j = f[0];
    }

这篇关于如何帮助gcc矢量化C代码的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆