ICC中的-O3弄乱了内在函数,可以与-O1或-O2或与相应的手动组装一起使用 [英] -O3 in ICC messes up intrinsics, fine with -O1 or -O2 or with corresponding manual assembly

查看:176
本文介绍了ICC中的-O3弄乱了内在函数,可以与-O1或-O2或与相应的手动组装一起使用的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

这是对以下4x4矩阵乘法C = AB的代码在所有优化设置的ICC上均可正常编译.它在-O1和-O2上正确执行,但是在-O3上给出不正确的结果.问题似乎来自_mm256_storeu_pd操作,因为用下面的asm语句替换它(并且仅替换它)在执行后给出了正确的结果.有什么想法吗?

The code below for a 4x4 matrix multiplication C = AB compiles fine on ICC on all optimization settings. It executes correctly on -O1 and -O2, but gives an incorrect result on -O3. The problem seems to come from the _mm256_storeu_pd operation, as substituting it (and only it) with the asm statement below gives correct results after execution. Any ideas?

inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(double *A, double *B, double *C)
{
    size_t i;

    /* the registers you use */
    __m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
    //  __m256d *C256 = (__m256d *)C;

    /* load values from B */
    b0 = _mm256_loadu_pd(&B[0]);
    b1 = _mm256_loadu_pd(&B[4]);
    b2 = _mm256_loadu_pd(&B[8]);
    b3 = _mm256_loadu_pd(&B[12]);

    for (i = 0; i < 4; i++) {
        /* load values from A */
        a0 = _mm256_set1_pd(A[4*i + 0]);
        a1 = _mm256_set1_pd(A[4*i + 1]);
        a2 = _mm256_set1_pd(A[4*i + 2]);
        a3 = _mm256_set1_pd(A[4*i + 3]);

        sum = _mm256_mul_pd(a0, b0);
        sum = _mm256_fmadd_pd(a1, b1, sum);
        sum = _mm256_fmadd_pd(a2, b2, sum);
        sum = _mm256_fmadd_pd(a3, b3, sum);

        // asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
        _mm256_storeu_pd(&C[4*i], sum);

    }

}

此外,这也是ICC生成的程序集.箭头分别用_mm256_storeu_pd或asm语句指示该行. RunIntrinsics_FMA_UnalignedCopy_Struct是一个函数,该函数从SourceMatrix中获取存储的数字并调用矩阵乘法例程.

Also, here is the assembly generated by ICC. The arrows indicate the line with _mm256_storeu_pd or the asm statement respectively. RunIntrinsics_FMA_UnalignedCopy_Struct is a function that takes stored numbers from SourceMatrix and calls the matrix multiplication routine.

-O2 -xcore-avx2

-O2 -xcore-avx2

ICC测试RunIntrinsics_FMA_UnalignedCopy_Struct:

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

0x1000053c0 <+0>:    pushq  %rbp
0x1000053c1 <+1>:    movq   %rsp, %rbp
0x1000053c4 <+4>:    andq   $-0x20, %rsp
0x1000053c8 <+8>:    pushq  %r12
0x1000053ca <+10>:   pushq  %r13
0x1000053cc <+12>:   pushq  %r14
0x1000053ce <+14>:   pushq  %r15
0x1000053d0 <+16>:   pushq  %rbx
0x1000053d1 <+17>:   subq   $0x4f8, %rsp              ; imm = 0x4F8 
0x1000053d8 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x1000053dd <+29>:   movq   %rax, %rbx
0x1000053e0 <+32>:   vmovupd 0x95f8(%rip), %ymm11      ; SourceMatrix + 190
0x1000053e8 <+40>:   xorl   %eax, %eax
0x1000053ea <+42>:   vxorpd %xmm1, %xmm1, %xmm1
0x1000053ee <+46>:   vmovsd %xmm1, 0x40(%rsp)
0x1000053f4 <+52>:   vmovupd 0x9604(%rip), %ymm10      ; SourceMatrix + 222
0x1000053fc <+60>:   vmovupd 0x95bc(%rip), %ymm12      ; SourceMatrix + 158
0x100005404 <+68>:   vmovupd 0x9574(%rip), %ymm13      ; SourceMatrix + 94
0x10000540c <+76>:   vmovupd 0x952c(%rip), %ymm14      ; SourceMatrix + 30
0x100005414 <+84>:   vmovupd 0x9504(%rip), %ymm15      ; c_feature_names + 446
0x10000541c <+92>:   vmovupd 0x95fc(%rip), %ymm8       ; SourceMatrix + 254
0x100005424 <+100>:  vmovupd 0x9614(%rip), %ymm7       ; SourceMatrix + 286
0x10000542c <+108>:  vmovupd 0x962c(%rip), %ymm6       ; SourceMatrix + 318
0x100005434 <+116>:  vmovupd 0x9644(%rip), %ymm5       ; SourceMatrix + 350
0x10000543c <+124>:  vmovupd 0x965c(%rip), %ymm3       ; SourceMatrix + 382
0x100005444 <+132>:  vmovupd 0x9674(%rip), %ymm2       ; SourceMatrix + 414
0x10000544c <+140>:  vmovupd 0x968c(%rip), %ymm1       ; SourceMatrix + 446
0x100005454 <+148>:  vmovupd %ymm10, 0x420(%rsp)
0x10000545d <+157>:  vmovupd %ymm11, 0x440(%rsp)
0x100005466 <+166>:  vmovsd 0x626a(%rip), %xmm10      ; xmm10 = mem[0],zero 
0x10000546e <+174>:  vmovsd 0x40(%rsp), %xmm11        ; xmm11 = mem[0],zero 
0x100005474 <+180>:  vmovupd 0x94e4(%rip), %ymm9       ; SourceMatrix + 62
0x10000547c <+188>:  vmovupd %ymm1, 0x20(%rsp)
0x100005482 <+194>:  vmovupd 0x9516(%rip), %ymm4       ; SourceMatrix + 126
0x10000548a <+202>:  vmovupd %ymm2, 0x360(%rsp)
0x100005493 <+211>:  vmovupd %ymm3, 0x3c0(%rsp)
0x10000549c <+220>:  vmovupd %ymm5, 0x380(%rsp)
0x1000054a5 <+229>:  vmovupd %ymm6, 0x3a0(%rsp)
0x1000054ae <+238>:  vmovupd %ymm7, 0x3e0(%rsp)
0x1000054b7 <+247>:  vmovupd %ymm8, 0x400(%rsp)
0x1000054c0 <+256>:  vmovupd %ymm12, 0x4c0(%rsp)
0x1000054c9 <+265>:  vmovupd %ymm13, 0x4a0(%rsp)
0x1000054d2 <+274>:  vmovupd %ymm14, 0x480(%rsp)
0x1000054db <+283>:  vmovupd %ymm15, 0x460(%rsp)
0x1000054e4 <+292>:  vxorpd %ymm0, %ymm0, %ymm0
0x1000054e8 <+296>:  vmovupd %ymm0, 0x260(%rsp)
0x1000054f1 <+305>:  vmovupd %ymm0, 0x2e0(%rsp)
0x1000054fa <+314>:  vmovupd %ymm0, 0x280(%rsp)
0x100005503 <+323>:  vmovupd %ymm0, 0x300(%rsp)
0x10000550c <+332>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100005515 <+341>:  vmovupd %ymm0, 0x320(%rsp)
0x10000551e <+350>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100005527 <+359>:  vmovupd %ymm0, 0x340(%rsp)
0x100005530 <+368>:  vmovupd 0x95c8(%rip), %ymm0       ; SourceMatrix + 478
0x100005538 <+376>:  vmovupd %ymm0, (%rsp)
0x10000553d <+381>:  incl   %eax
0x10000553f <+383>:  vxorpd %xmm3, %xmm3, %xmm3
0x100005543 <+387>:  vcvtsi2sdl %eax, %xmm3, %xmm3
0x100005547 <+391>:  vdivsd %xmm3, %xmm10, %xmm2
0x10000554b <+395>:  vbroadcastsd %xmm2, %ymm8
0x100005550 <+400>:  vaddpd 0x460(%rsp), %ymm8, %ymm1
0x100005559 <+409>:  vaddpd %ymm4, %ymm8, %ymm3
0x10000555d <+413>:  vaddpd 0x480(%rsp), %ymm8, %ymm0
0x100005566 <+422>:  vaddpd 0x420(%rsp), %ymm8, %ymm2
0x10000556f <+431>:  vaddpd %ymm9, %ymm8, %ymm6
0x100005574 <+436>:  vaddpd 0x4a0(%rsp), %ymm8, %ymm7
0x10000557d <+445>:  vaddpd 0x400(%rsp), %ymm8, %ymm5
0x100005586 <+454>:  vmovupd %ymm1, 0x60(%rsp)
0x10000558c <+460>:  vmovupd %ymm0, 0x80(%rsp)
0x100005595 <+469>:  vmovupd %ymm6, 0xa0(%rsp)
0x10000559e <+478>:  vmovupd %ymm7, 0xc0(%rsp)
0x1000055a7 <+487>:  vmovupd %ymm3, 0xe0(%rsp)
0x1000055b0 <+496>:  vmovupd %ymm5, 0x160(%rsp)
0x1000055b9 <+505>:  vmovupd %ymm2, 0x140(%rsp)
0x1000055c2 <+514>:  vbroadcastsd 0x60(%rsp), %ymm14
0x1000055c9 <+521>:  vbroadcastsd 0x68(%rsp), %ymm13
0x1000055d0 <+528>:  vbroadcastsd 0x70(%rsp), %ymm15
0x1000055d7 <+535>:  vbroadcastsd 0x78(%rsp), %ymm12
0x1000055de <+542>:  vmulpd %ymm14, %ymm3, %ymm14
0x1000055e3 <+547>:  vaddpd 0x4c0(%rsp), %ymm8, %ymm1
0x1000055ec <+556>:  vaddpd 0x440(%rsp), %ymm8, %ymm0
0x1000055f5 <+565>:  vaddpd 0x380(%rsp), %ymm8, %ymm5
0x1000055fe <+574>:  vaddpd 0x3e0(%rsp), %ymm8, %ymm6
0x100005607 <+583>:  vaddpd 0x3a0(%rsp), %ymm8, %ymm7
0x100005610 <+592>:  vfmadd213pd %ymm14, %ymm1, %ymm13
0x100005615 <+597>:  vmovupd %ymm5, 0x1c0(%rsp)
0x10000561e <+606>:  vmovupd %ymm1, 0x100(%rsp)
0x100005627 <+615>:  vmovupd %ymm6, 0x180(%rsp)
0x100005630 <+624>:  vmovupd %ymm7, 0x1a0(%rsp)
0x100005639 <+633>:  vfmadd213pd %ymm13, %ymm0, %ymm15
0x10000563e <+638>:  vmovupd %ymm0, 0x120(%rsp)
0x100005647 <+647>:  vbroadcastsd 0x88(%rsp), %ymm13
0x100005651 <+657>:  vbroadcastsd 0x90(%rsp), %ymm14
0x10000565b <+667>:  vfmadd213pd %ymm15, %ymm2, %ymm12
0x100005660 <+672>:  vbroadcastsd 0x80(%rsp), %ymm15
0x10000566a <+682>:  vaddpd 0x3c0(%rsp), %ymm8, %ymm5
0x100005673 <+691>:  vaddpd 0x360(%rsp), %ymm8, %ymm7
0x10000567c <+700>:  vaddpd 0x20(%rsp), %ymm8, %ymm6
0x100005682 <+706>:  vaddpd (%rsp), %ymm8, %ymm8
0x100005687 <+711>:  vmulpd %ymm15, %ymm3, %ymm15
->  0x10000568c <+716>:  vmovupd %ymm12, 0x260(%rsp)
0x100005695 <+725>:  vmovupd %ymm5, 0x1e0(%rsp)
0x10000569e <+734>:  vmovupd %ymm8, 0x240(%rsp)
0x1000056a7 <+743>:  vmovupd %ymm6, 0x220(%rsp)
0x1000056b0 <+752>:  vfmadd213pd %ymm15, %ymm1, %ymm13
0x1000056b5 <+757>:  vmovupd %ymm7, 0x200(%rsp)

-O3 -xcore-avx2

-O3 -xcore-avx2

ICC测试RunIntrinsics_FMA_UnalignedCopy_Struct:

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

0x100004c10 <+0>:    pushq  %rbp
0x100004c11 <+1>:    movq   %rsp, %rbp
0x100004c14 <+4>:    andq   $-0x20, %rsp
0x100004c18 <+8>:    pushq  %r12
0x100004c1a <+10>:   pushq  %r13
0x100004c1c <+12>:   pushq  %r14
0x100004c1e <+14>:   pushq  %r15
0x100004c20 <+16>:   pushq  %rbx
0x100004c21 <+17>:   subq   $0x858, %rsp              ; imm = 0x858 
0x100004c28 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x100004c2d <+29>:   movq   %rax, %rbx
0x100004c30 <+32>:   vbroadcastsd 0xc0(%rsp), %ymm15
0x100004c3a <+42>:   xorl   %eax, %eax
0x100004c3c <+44>:   vxorpd %xmm1, %xmm1, %xmm1
0x100004c40 <+48>:   vmovsd %xmm1, 0x40(%rsp)
0x100004c46 <+54>:   vmovupd 0x9c12(%rip), %ymm2       ; c_feature_names + 446
0x100004c4e <+62>:   vmovupd %ymm15, 0x620(%rsp)
0x100004c57 <+71>:   vmovupd 0x9c41(%rip), %ymm13      ; SourceMatrix + 62
0x100004c5f <+79>:   vmovupd 0x9c19(%rip), %ymm14      ; SourceMatrix + 30
0x100004c67 <+87>:   vmovupd 0x9c51(%rip), %ymm12      ; SourceMatrix + 94
0x100004c6f <+95>:   vmovupd %ymm2, 0x640(%rsp)
0x100004c78 <+104>:  vmovupd 0x9c60(%rip), %ymm11      ; SourceMatrix + 126
0x100004c80 <+112>:  vmovupd 0x9c78(%rip), %ymm10      ; SourceMatrix + 158
0x100004c88 <+120>:  vmovupd 0x9c90(%rip), %ymm9       ; SourceMatrix + 190
0x100004c90 <+128>:  vmovupd %ymm13, 0x680(%rsp)
0x100004c99 <+137>:  vmovupd 0x9c9f(%rip), %ymm8       ; SourceMatrix + 222
0x100004ca1 <+145>:  vmovupd 0x9cb7(%rip), %ymm7       ; SourceMatrix + 254
0x100004ca9 <+153>:  vmovupd 0x9ccf(%rip), %ymm6       ; SourceMatrix + 286
0x100004cb1 <+161>:  vmovupd %ymm9, 0x700(%rsp)
0x100004cba <+170>:  vmovupd 0x9cde(%rip), %ymm5       ; SourceMatrix + 318
0x100004cc2 <+178>:  vmovupd 0x9cf6(%rip), %ymm4       ; SourceMatrix + 350
0x100004cca <+186>:  vmovupd 0x9d0e(%rip), %ymm3       ; SourceMatrix + 382
0x100004cd2 <+194>:  vmovupd %ymm6, 0x760(%rsp)
0x100004cdb <+203>:  vmovupd 0x9d1d(%rip), %ymm2       ; SourceMatrix + 414
0x100004ce3 <+211>:  vmovupd 0x9d35(%rip), %ymm1       ; SourceMatrix + 446
0x100004ceb <+219>:  vmovsd 0x40(%rsp), %xmm13        ; xmm13 = mem[0],zero 
0x100004cf1 <+225>:  vbroadcastsd 0xc8(%rsp), %ymm15
0x100004cfb <+235>:  vmovupd %ymm3, 0x7c0(%rsp)
0x100004d04 <+244>:  vmovupd %ymm2, 0x7e0(%rsp)
0x100004d0d <+253>:  vmovupd %ymm1, 0x800(%rsp)
0x100004d16 <+262>:  vmovupd %ymm15, 0x600(%rsp)
0x100004d1f <+271>:  vmovupd %ymm4, 0x7a0(%rsp)
0x100004d28 <+280>:  vmovupd %ymm5, 0x780(%rsp)
0x100004d31 <+289>:  vmovupd %ymm7, 0x740(%rsp)
0x100004d3a <+298>:  vmovupd %ymm8, 0x720(%rsp)
0x100004d43 <+307>:  vmovupd %ymm10, 0x6e0(%rsp)
0x100004d4c <+316>:  vmovupd %ymm11, 0x6c0(%rsp)
0x100004d55 <+325>:  vmovupd %ymm12, 0x6a0(%rsp)
0x100004d5e <+334>:  vmovupd %ymm14, 0x660(%rsp)
0x100004d67 <+343>:  vbroadcastsd 0xd0(%rsp), %ymm15
0x100004d71 <+353>:  vmovupd %ymm15, 0x5e0(%rsp)
0x100004d7a <+362>:  vbroadcastsd 0xd8(%rsp), %ymm15
0x100004d84 <+372>:  vmovupd %ymm15, 0x5c0(%rsp)
0x100004d8d <+381>:  vbroadcastsd 0xe0(%rsp), %ymm15
0x100004d97 <+391>:  vmovupd %ymm15, 0x5a0(%rsp)
0x100004da0 <+400>:  vbroadcastsd 0xe8(%rsp), %ymm15
0x100004daa <+410>:  vmovupd %ymm15, 0x580(%rsp)
0x100004db3 <+419>:  vbroadcastsd 0xf0(%rsp), %ymm15
0x100004dbd <+429>:  vmovupd %ymm15, 0x560(%rsp)
0x100004dc6 <+438>:  vbroadcastsd 0xf8(%rsp), %ymm15
0x100004dd0 <+448>:  vmovupd %ymm15, 0x540(%rsp)
0x100004dd9 <+457>:  vbroadcastsd 0x100(%rsp), %ymm15
0x100004de3 <+467>:  vmovupd %ymm15, 0x520(%rsp)
0x100004dec <+476>:  vbroadcastsd 0x108(%rsp), %ymm15
0x100004df6 <+486>:  vmovupd %ymm15, 0x500(%rsp)
0x100004dff <+495>:  vbroadcastsd 0x110(%rsp), %ymm15
0x100004e09 <+505>:  vmovupd %ymm15, 0x4e0(%rsp)
0x100004e12 <+514>:  vbroadcastsd 0x118(%rsp), %ymm15
0x100004e1c <+524>:  vmovupd %ymm15, 0x4c0(%rsp)
0x100004e25 <+533>:  vbroadcastsd 0x1c0(%rsp), %ymm15
0x100004e2f <+543>:  vmovupd %ymm15, 0x4a0(%rsp)
0x100004e38 <+552>:  vbroadcastsd 0x1c8(%rsp), %ymm15
0x100004e42 <+562>:  vmovupd %ymm15, 0x480(%rsp)
0x100004e4b <+571>:  vbroadcastsd 0x1d0(%rsp), %ymm15
0x100004e55 <+581>:  vmovupd %ymm15, 0x460(%rsp)
0x100004e5e <+590>:  vbroadcastsd 0x1d8(%rsp), %ymm15
0x100004e68 <+600>:  vmovupd %ymm15, 0x440(%rsp)
0x100004e71 <+609>:  vbroadcastsd 0x1e0(%rsp), %ymm15
0x100004e7b <+619>:  vmovupd %ymm15, 0x420(%rsp)
0x100004e84 <+628>:  vbroadcastsd 0x1e8(%rsp), %ymm15
0x100004e8e <+638>:  vmovupd %ymm15, 0x400(%rsp)
0x100004e97 <+647>:  vbroadcastsd 0x1f0(%rsp), %ymm15
0x100004ea1 <+657>:  vmovupd %ymm15, 0x3e0(%rsp)
0x100004eaa <+666>:  vbroadcastsd 0x1f8(%rsp), %ymm15
0x100004eb4 <+676>:  vmovupd %ymm15, 0x3c0(%rsp)
0x100004ebd <+685>:  vbroadcastsd 0x200(%rsp), %ymm15
0x100004ec7 <+695>:  vmovupd %ymm15, 0x3a0(%rsp)
0x100004ed0 <+704>:  vbroadcastsd 0x208(%rsp), %ymm15
0x100004eda <+714>:  vmovupd %ymm15, 0x80(%rsp)
0x100004ee3 <+723>:  vbroadcastsd 0x210(%rsp), %ymm15
0x100004eed <+733>:  vxorpd %ymm0, %ymm0, %ymm0
0x100004ef1 <+737>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100004efa <+746>:  vmovupd %ymm0, 0x320(%rsp)
0x100004f03 <+755>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100004f0c <+764>:  vmovupd %ymm0, 0x340(%rsp)
0x100004f15 <+773>:  vmovupd %ymm0, 0x2e0(%rsp)
0x100004f1e <+782>:  vmovupd %ymm0, 0x360(%rsp)
0x100004f27 <+791>:  vmovupd %ymm0, 0x300(%rsp)
0x100004f30 <+800>:  vmovupd %ymm0, 0x380(%rsp)
0x100004f39 <+809>:  vmovupd 0x9aff(%rip), %ymm0       ; SourceMatrix + 478
0x100004f41 <+817>:  vmovupd %ymm15, 0x60(%rsp)
0x100004f47 <+823>:  vbroadcastsd 0x218(%rsp), %ymm15
0x100004f51 <+833>:  vmovupd %ymm0, 0x820(%rsp)
0x100004f5a <+842>:  vmovupd %ymm15, 0x20(%rsp)
0x100004f60 <+848>:  incl   %eax
0x100004f62 <+850>:  vxorpd %xmm12, %xmm12, %xmm12
0x100004f67 <+855>:  vcvtsi2sdl %eax, %xmm12, %xmm12
0x100004f6b <+859>:  vmovsd 0x6765(%rip), %xmm11      ; xmm11 = mem[0],zero 
0x100004f73 <+867>:  vdivsd %xmm12, %xmm11, %xmm8
0x100004f78 <+872>:  vbroadcastsd %xmm8, %ymm7
0x100004f7d <+877>:  vaddpd 0x640(%rsp), %ymm7, %ymm9
0x100004f86 <+886>:  vaddpd 0x6c0(%rsp), %ymm7, %ymm0
0x100004f8f <+895>:  vaddpd 0x6e0(%rsp), %ymm7, %ymm1
0x100004f98 <+904>:  vaddpd 0x700(%rsp), %ymm7, %ymm2
0x100004fa1 <+913>:  vaddpd 0x720(%rsp), %ymm7, %ymm3
0x100004faa <+922>:  vaddpd 0x740(%rsp), %ymm7, %ymm8
0x100004fb3 <+931>:  vaddpd 0x7c0(%rsp), %ymm7, %ymm4
0x100004fbc <+940>:  vaddpd 0x7e0(%rsp), %ymm7, %ymm5
0x100004fc5 <+949>:  vaddpd 0x800(%rsp), %ymm7, %ymm6
0x100004fce <+958>:  vaddpd 0x660(%rsp), %ymm7, %ymm10
0x100004fd7 <+967>:  vaddpd 0x680(%rsp), %ymm7, %ymm12
0x100004fe0 <+976>:  vaddpd 0x6a0(%rsp), %ymm7, %ymm11
0x100004fe9 <+985>:  vmovupd %ymm9, 0xa0(%rsp)
0x100004ff2 <+994>:  vmovupd %ymm0, 0x120(%rsp)
0x100004ffb <+1003>: vmovupd %ymm8, 0x1a0(%rsp)
0x100005004 <+1012>: vmovupd %ymm3, 0x180(%rsp)
0x10000500d <+1021>: vmovupd %ymm1, 0x140(%rsp)
0x100005016 <+1030>: vmovupd %ymm2, 0x160(%rsp)
0x10000501f <+1039>: vmovupd %ymm10, (%rsp)
0x100005024 <+1044>: vmovupd %ymm4, 0x220(%rsp)
0x10000502d <+1053>: vmovupd %ymm5, 0x240(%rsp)
0x100005036 <+1062>: vmovupd %ymm6, 0x260(%rsp)
0x10000503f <+1071>: vbroadcastsd 0xa0(%rsp), %ymm14
0x100005049 <+1081>: vbroadcastsd 0xa8(%rsp), %ymm15
0x100005053 <+1091>: vaddpd 0x760(%rsp), %ymm7, %ymm10
0x10000505c <+1100>: vaddpd 0x780(%rsp), %ymm7, %ymm9
0x100005065 <+1109>: vaddpd 0x7a0(%rsp), %ymm7, %ymm8
0x10000506e <+1118>: vaddpd 0x820(%rsp), %ymm7, %ymm7
0x100005077 <+1127>: vmulpd %ymm14, %ymm0, %ymm14
0x10000507c <+1132>: vmovupd %ymm7, 0x280(%rsp)
0x100005085 <+1141>: vfmadd213pd %ymm14, %ymm1, %ymm15
0x10000508a <+1146>: vbroadcastsd 0xb0(%rsp), %ymm14
0x100005094 <+1156>: vfmadd213pd %ymm15, %ymm2, %ymm14
0x100005099 <+1161>: vbroadcastsd 0xb8(%rsp), %ymm15
0x1000050a3 <+1171>: vfmadd213pd %ymm14, %ymm3, %ymm15
0x1000050a8 <+1176>: vmulpd 0x620(%rsp), %ymm0, %ymm14
->  0x1000050b1 <+1185>: vmovupd %ymm15, 0x2a0(%rsp)
0x1000050ba <+1194>: vmulpd 0x5a0(%rsp), %ymm0, %ymm15
0x1000050c3 <+1203>: vmulpd 0x520(%rsp), %ymm0, %ymm0
0x1000050cc <+1212>: vfmadd231pd 0x600(%rsp), %ymm1, %ymm14
0x1000050d6 <+1222>: vfmadd231pd 0x580(%rsp), %ymm1, %ymm15
0x1000050e0 <+1232>: vfmadd231pd 0x500(%rsp), %ymm1, %ymm0

由于空间限制,删除之前使用asm命令运行的程序集.

Working assembly with asm command from before deleted due to space restriction.

修改: 以下代码使用ICC 18在gcc.godbolt.org上编译:

The following code compiles on gcc.godbolt.org with ICC 18:

//
//  main.cpp
//  ICC Test
//

#include <iostream>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <iomanip>
#include <complex>
#include <cmath>
#include <fstream>
#include <immintrin.h>

using namespace std;

#define N 4

#define POWER_FACTOR 4 // Power factor tells us how many matrices need to be multiplied. For the standard Wilson action, this is 4. For the first improvement, 6. But the relative runtime ration is independent of this.

#define ITERATIONS 10000000

#define GENERATE_NEW_RANDOMS false

typedef double FP_TYPE;

FP_TYPE SourceMatrix[POWER_FACTOR][N][N];





void InitialiseSourceMatrices();


void Run3ForLoops_Pointer();

inline void Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C);


void RunIntrinsics_FMA_UnalignedCopy_Struct();

inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C);



inline FP_TYPE random(FP_TYPE min, FP_TYPE max) {
    return min + (max-min)*FP_TYPE(rand())/FP_TYPE(RAND_MAX);
}



int main(int argc, const char * argv[]) {


    cout <<"Beginning computation\n\n";



    InitialiseSourceMatrices();

    Run3ForLoops_Pointer();

    RunIntrinsics_FMA_UnalignedCopy_Struct();


    return 0;

}


void InitialiseSourceMatrices() {

    int i, j, k;

    // Assing random numbers to imaginary and real parts
    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            for(i=0; i<POWER_FACTOR; i++) {
                SourceMatrix[i][j][k] = random(-1.0, 1.0);
            }
        }
    }

}





void RunIntrinsics_FMA_UnalignedCopy_Struct() {


    clock_t timer;
    timer = clock();

    // Initialise Variables:


    int i, j, k, n;

    FP_TYPE MatrixDummy1[N][N], MatrixDummy2[N][N], MatrixDummy3[N][N];

    for(k=0; k<N; k++) {
        for(i=0; i<POWER_FACTOR; i++) {
            MatrixDummy1[k][i]=0.;
            MatrixDummy2[k][i]=0.;
            MatrixDummy3[k][i]=0.;
        }
    }


    struct matrix_struct {
        //  int dummy;
        //   __declspec(aligned(32))  double m[N][N];
        FP_TYPE m[N][N] __attribute__ ((aligned (32)));
    };

    // matrix_struct Matrix[POWER_FACTOR];

    matrix_struct Matrix[POWER_FACTOR]; // __attribute__ ((aligned (32)));

    //  double *p1, *p2, *p3, *p0;

    FP_TYPE trace = 0.0;

    // Read source matrices in own data format


    for(n=0; n<ITERATIONS; n++) { // We do the whole process ITERATIONS times to get less error for the runtime .

        // srand (time(NULL));

        for(j=0; j<N; j++) {
            for(k=0; k<N; k++) {
                for(i=0; i<POWER_FACTOR; i++) {
                    if(GENERATE_NEW_RANDOMS) Matrix[i].m[j][k] = random(-1.0, 1.0);
                    else Matrix[i].m[j][k] = SourceMatrix[i][j][k]+1.0/(double)(n+1);
                }
            }
        }


        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(&Matrix[0].m[0][0], &Matrix[1].m[0][0], (&MatrixDummy1)[0][0]);

        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(&Matrix[2].m[0][0], &Matrix[3].m[0][0], (&MatrixDummy2)[0][0]);

        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix((&MatrixDummy1)[0][0], (&MatrixDummy2)[0][0], (&MatrixDummy3)[0][0]);


        for(j=0; j<N; j++) {
            trace += MatrixDummy3[j][j];
        }



    }

    cout << setprecision(15);

    cout << "Trace Intrinsics = \t" << trace / (double) ITERATIONS << "    took " << (double) (clock()-timer) / CLOCKS_PER_SEC << "s" << endl << endl;


}



void Run3ForLoops_Pointer() {


    clock_t timer;
    timer = clock();

    // Initialise Variables:


    int i, j, k, n;

    FP_TYPE MatrixDummy1[N][N], MatrixDummy2[N][N], MatrixDummy3[N][N];;


    struct matrix_struct {
        //  int dummy;
        //   __declspec(aligned(32))  double m[N][N];
        FP_TYPE m[N][N] __attribute__ ((aligned (32)));
    };

    // matrix_struct Matrix[POWER_FACTOR];

    matrix_struct Matrix[POWER_FACTOR]; // __attribute__ ((aligned (32)));

    //  double *p1, *p2, *p3, *p0;

    FP_TYPE trace = 0.0;

    // Read source matrices in own data format


    for(n=0; n<ITERATIONS; n++) { // We do the whole process ITERATIONS times to get less error for the runtime .

        // srand (time(NULL));

        for(j=0; j<N; j++) {
            for(k=0; k<N; k++) {
                for(i=0; i<POWER_FACTOR; i++) {
                    if(GENERATE_NEW_RANDOMS) Matrix[i].m[j][k] = random(-1.0, 1.0);
                    else Matrix[i].m[j][k] = SourceMatrix[i][j][k]+1.0/(double)(n+1);
                }
            }
        }



        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(&Matrix[0].m[0][0], &Matrix[1].m[0][0], (&MatrixDummy1)[0][0]);

        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(&Matrix[2].m[0][0], &Matrix[3].m[0][0], (&MatrixDummy2)[0][0]);

        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3((&MatrixDummy1)[0][0], (&MatrixDummy2)[0][0], (&MatrixDummy3)[0][0]);




        for(j=0; j<N; j++) {
            trace += MatrixDummy3[j][j];
        }



    }

    cout << setprecision(15);

    cout << "Trace For Point. = \t\t" << trace / (double) ITERATIONS << "    took " << (double) (clock()-timer) / CLOCKS_PER_SEC << "s" << endl << endl;


}



inline void Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C){

    int i, j, k;

    FP_TYPE dummy[N][N];

    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            dummy[j][k] = 0.0;
            for(i=0; i<N; i++) {
                dummy[j][k] += *(A+j*4+i)*(*(B+i*4+k));
            }
        }
    }

    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            *(C+j*4+k) = dummy[j][k];
        }
    }


}



void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C)
{
    size_t i;

    // the registers you use
    __m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
    __m256d *B256 = (__m256d *)B, *C256 = (__m256d *)C;

    // load values from B
    b0 = _mm256_loadu_pd(&B[0]);
    b1 = _mm256_loadu_pd(&B[4]);
    b2 = _mm256_loadu_pd(&B[8]);
    b3 = _mm256_loadu_pd(&B[12]);

    for (i = 0; i < 4; i++) {
        // load values from A
        a0 = _mm256_set1_pd(A[4*i + 0]);
        a1 = _mm256_set1_pd(A[4*i + 1]);
        a2 = _mm256_set1_pd(A[4*i + 2]);
        a3 = _mm256_set1_pd(A[4*i + 3]);

        sum = _mm256_mul_pd(a0, b0);
        sum = _mm256_fmadd_pd(a1, b1, sum);
        sum = _mm256_fmadd_pd(a2, b2, sum);
        sum = _mm256_fmadd_pd(a3, b3, sum);

     //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
            _mm256_storeu_pd(&C[4*i], sum);

    }
}

推荐答案

我注意到的一件事是,您将FPTYPE*传递给实际上是多维数组的乘法函数.

One thing I noticed is that you are passing a FPTYPE* to the multiplication functions which are actually multi dimensional arrays.

也许英特尔编译器不太喜欢这个?

Maybe the Intel compiler doesn't like this too much?

为了更好地理解您的代码,我对C结构进行了一些C ++编程,并且我的代码现在将const结构引用传递给乘法函数.

In order to better understand your code I did some C++ification of the C constructs and my code is now passing a const struct reference to the multiplication functions.

我没有Intel编译器的许可证,但是也许您可以检查代码现在是否可以在-O3上运行:

I don't have a license for the Intel compiler, but maybe you can check if the code now works at -O3:

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <immintrin.h>

constexpr int N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr int POWER_FACTOR = 4;
constexpr int ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

typedef double FP_TYPE;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32)));
};

typedef void (*multiply_method)(const Matrix&, const Matrix&, Matrix&);

Matrix source_matrices[POWER_FACTOR];

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_source_matrices ();
void test_run (multiply_method method, const std::string &method_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void randomize_source_matrices ()
{
  // Assign random numbers to imaginary and real parts
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          for (int i = 0; i < POWER_FACTOR; i++)
            {
              source_matrices[i].m[j][k] = random(-1.0, 1.0);
            }
        }
    }
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          c.m[j][k] = 0.0;
          for (int i = 0; i < N; i++)
            {
              c.m[j][k] += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void test_run (multiply_method method, const std::string &method_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1 = {0};
  Matrix matrix_dummy2 = {0};
  Matrix matrix_dummy3 = {0};

  Matrix matrices[POWER_FACTOR];
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (int n = 0; n < ITERATIONS; n++)
    {
      for (int j = 0; j < N; j++)
        {
          for (int k = 0; k < N; k++)
            {
              for (int i = 0; i < POWER_FACTOR; i++)
                {
                  if (GENERATE_NEW_RANDOMS)
                    {
                      matrices[i].m[j][k] = random (-1.0, 1.0);
                    }
                  else
                    {
                      matrices[i].m[j][k] = source_matrices[i].m[j][k] + 1.0 / (double)(n + 1);
                    }
                }
            }
        }
      method (matrices[0], matrices[1], matrix_dummy1);
      method (matrices[2], matrices[3], matrix_dummy2);
      method (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (int j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << method_name << " = \t";
  std::cout << trace / (double) ITERATIONS;
  std::cout << "    took ";
  std::cout << (double) (clock() - timer) / CLOCKS_PER_SEC << "s\n\n";
}

int main ()
{
  std::cout << "Beginning computation\n\n";
  randomize_source_matrices ();
  test_run (multiply_plain, "For Point");
  test_run (multiply_intrinsics, "Intrinsics");
}

这有点慢,因为我将两个测试函数融合为一个,并删除了过程中的内联指令.

It is a bit slower, because I fused the two test functions into one and removed the inline directives in the process.

(如果您愿意容忍某些代码重复,那么将它们添加回当然应该没问题.)

(It should be no problem to add them back of course if your willing to tolerate some code duplication.)

对于此代码,仍有一些危险之处,例如,它仅在N = 4下正常工作.在生产中使用此类代码之前,请确保添加一些静态断言或一些类似的安全措施.

There are still some things that are dangerous about this code, for example it only works correctly with N = 4. Be sure to add some static assertions or some similar safety measures before using such code in production.

另一件事是,仍然有一些C样式的(double)强制转换喷入了其中,但是我认为这仅仅是因为它是测试代码.我也不确定代码是否可以在其他FP_TYPE上使用(以前从未在内部函数上使用过...).

Another thing is that there are still some C style (double) casts sprayed into it, but I assume that is only because it is test code. I'm also not sure if the code would ever work for a different FP_TYPE (never worked with intrinsics before ...).

仅出于完整性考虑,这是进一步改进的版本:

Just for completeness here is a further improved version:

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include <immintrin.h>

using FP_TYPE = double;

constexpr size_t N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr size_t POWER_FACTOR = 4;
constexpr size_t ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32))) = {{0}};
};

using multiply_func = void (*) (const Matrix&, const Matrix&, Matrix&);
using set_func = FP_TYPE (*) ();
using transform_func = FP_TYPE (*) (FP_TYPE value);

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_matrix (Matrix &matrix);
void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func,
               const std::string &func_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);
void set_each_matrix_value (Matrix &matrix, const set_func &func);
void init_matrix (const Matrix &source_matrix, Matrix &matrix,
                  size_t iteration);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void set_each_matrix_value (Matrix &matrix, const set_func &func)
{
  for (auto &j : matrix.m)
    {
      for (auto &k : j)
        {
          k = func ();
        }
    }
}

void randomize_matrix (Matrix &matrix)
{
  // Assign random numbers to imaginary and real parts
  set_each_matrix_value (matrix, [] ()
  {
    return random(-1.0, 1.0);
  });
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          auto &val = c.m[j][k];
          val = 0.0;
          for (size_t i = 0; i < N; i++)
            {
              val += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  static_assert (N == 4);
  static_assert (sizeof (FP_TYPE) == 8);
  static_assert (N * sizeof(FP_TYPE) == 256 / 8);
  // In addition the array in Matrix.m must be properly aligned

  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void init_matrix (const Matrix &source_matrix, Matrix &matrix, size_t iteration)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          matrix.m[j][k] = source_matrix.m[j][k] + 1.0 / static_cast<FP_TYPE>
                           (iteration + 1);
        }
    }
}

void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func, const std::string &func_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1;
  Matrix matrix_dummy2;
  Matrix matrix_dummy3;

  std::vector<Matrix> matrices (POWER_FACTOR);
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (size_t n = 0; n < ITERATIONS; n++)
    {
      if constexpr (GENERATE_NEW_RANDOMS)
        {
          for (auto &matrix : matrices)
            {
              randomize_matrix (matrix);
            }
        }
      else
        {
          for (size_t i = 0; i < POWER_FACTOR; i++)
            {
              init_matrix (source_matrices[i], matrices[i], n);
            }
        }

      func (matrices[0], matrices[1], matrix_dummy1);
      func (matrices[2], matrices[3], matrix_dummy2);
      func (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (size_t j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << func_name << " = \t";
  std::cout << trace / static_cast<FP_TYPE> (ITERATIONS);
  std::cout << "    took ";
  std::cout << static_cast<double> (clock() - timer) / CLOCKS_PER_SEC << "s\n";
  std::cout << std::endl;
}

int main ()
{
  std::vector<Matrix> source_matrices (POWER_FACTOR);
  std::cout << "Beginning computation\n";
  std::cout << std::endl;
  for (auto &matrix : source_matrices)
    {
      randomize_matrix (matrix);
    }
  test_run (source_matrices, multiply_plain, "For Point");
  test_run (source_matrices, multiply_intrinsics, "Intrinsics");
}

BTW:要使用g ++或clang ++进行编译,您必须添加-march=haswell(或任何具有的CPU).

BTW: To compile with g++ or clang++ you have to add -march=haswell (or whatever CPU you have).

这篇关于ICC中的-O3弄乱了内在函数,可以与-O1或-O2或与相应的手动组装一起使用的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆