Questions about the performance of different implementations of strlen


Question


    I have implemented the strlen() function in different ways, including SSE2 assembly, SSE4.2 assembly, and SSE2 intrinsics. I also ran some experiments with them, against strlen() from <string.h> and strlen() in glibc. However, their performance in milliseconds is unexpected.

    My experiment environment: CentOS 7.0 + gcc 4.8.5 + Intel Xeon

    Following are my implementations:

    1. strlen using SSE2 assembly

      long strlen_sse2_asm(const char* src){
      long result = 0;
      asm(
          "movl %1, %%edi\n\t"
          "movl $-0x10, %%eax\n\t"
          "pxor %%xmm0, %%xmm0\n\t"
          "lloop:\n\t"
              "addl $0x10, %%eax\n\t"
              "movdqu (%%edi,%%eax), %%xmm1\n\t"
              "pcmpeqb %%xmm0, %%xmm1\n\t"
              "pmovmskb %%xmm1, %%ecx\n\t"
              "test %%ecx, %%ecx\n\t"
              "jz lloop\n\t"
      
          "bsf %%ecx, %%ecx\n\t"
          "addl %%ecx, %%eax\n\t"
          "movl %%eax, %0"
          :"=r"(result)
          :"r"(src)
          :"%eax"
          );
      return result;
      }
      

    2. strlen using SSE4.2 assembly

    long strlen_sse4_2_asm(const char* src){
    long result = 0;
    asm(
        "movl %1, %%edi\n\t"
        "movl $-0x10, %%eax\n\t"
        "pxor %%xmm0, %%xmm0\n\t"
        "lloop2:\n\t"
            "addl $0x10, %%eax\n\t"
            "pcmpistri $0x08,(%%edi, %%eax), %%xmm0\n\t"
            "jnz lloop2\n\t"
    
            "add %%ecx, %%eax\n\t"
            "movl %%eax, %0"
    
        :"=r"(result)
        :"r"(src)
        :"%eax"
        );
    return result;
    }
    

    3. strlen using SSE2 intrinsic

    long strlen_sse2_intrin_align(const char* src){
    if (src == NULL || *src == '\0'){
        return 0;
    }
    const __m128i zero = _mm_setzero_si128();
    const __m128i* ptr = (const __m128i*)src;
    
    if(((size_t)ptr&0xF)!=0){
        __m128i xmm = _mm_loadu_si128(ptr);
        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm,zero));
        if(mask!=0){
            return (const char*)ptr-src+(size_t)ffs(mask);
        }
        ptr = (__m128i*)(0x10+(size_t)ptr & ~0xF);
    }
    for (;;ptr++){
        __m128i xmm = _mm_load_si128(ptr);
        unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(xmm,zero));
        if (mask!=0)
            return (const char*)ptr-src+(size_t)ffs(mask);
    }
    
    }
    

    4. I also looked up the one implemented in the Linux kernel; the following is its implementation:

      size_t strlen_inline_asm(const char* str){
      int d0;
      size_t res;
      asm volatile("repne\n\t"
      "scasb"
      :"=c" (res), "=&D" (d0)
      : "1" (str), "a" (0), "0" (0xffffffffu)
      : "memory");
      
      return ~res-1;
      }
      

    In my experiment, I also added the standard library's strlen and compared their performance. The following is my main function code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <xmmintrin.h>
    #include <x86intrin.h>
    #include <emmintrin.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/time.h>
    int main()
    {
        struct timeval tpstart,tpend;
        char test_str[1024];
        int i=0;
        for(;i<1023;i++){
                test_str[i] = 'a';
        }
        test_str[i]='\0';
        gettimeofday(&tpstart,NULL);
        for(i=0;i<10000000;i++)
                strlen(test_str);
        gettimeofday(&tpend,NULL);
        printf("strlen from string.h--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
    
        gettimeofday(&tpstart,NULL);
        for(i=0;i<10000000;i++)
                strlen_inline_asm(test_str);
        gettimeofday(&tpend,NULL);
        printf("strlen_inline_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
    
        gettimeofday(&tpstart,NULL);
        for(i=0;i<10000000;i++)
                strlen_sse2_asm(test_str);
        gettimeofday(&tpend,NULL);
        printf("strlen_sse2_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
    
        gettimeofday(&tpstart,NULL);
        for(i=0;i<10000000;i++)
                strlen_sse4_2_asm(test_str);
        gettimeofday(&tpend,NULL);
        printf("strlen_sse4_2_asm--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
    
        gettimeofday(&tpstart,NULL);
        for(i=0;i<10000000;i++)
                strlen_sse2_intrin_align(test_str);
        gettimeofday(&tpend,NULL);
        printf("strlen_sse2_intrin_align--->%lf\n",(tpend.tv_sec-tpstart.tv_sec)*1000+(tpend.tv_usec-tpstart.tv_usec)/1000.0);
    
        return 0;
    }
    

    The results are (in ms):

    strlen from string.h--->23.518000
    strlen_inline_asm--->222.311000
    strlen_sse2_asm--->782.907000
    strlen_sse4_2_asm--->955.960000
    strlen_sse2_intrin_align--->3499.586000
    

    I have some questions about it:

    1. Why is strlen from string.h so fast? I think its code should be identical to strlen_inline_asm, because I copied the code from /linux-4.2.2/arch/x86/lib/string_32.c [http://lxr.oss.org.cn/source/arch/x86/lib/string_32.c#L164]
    2. Why do the SSE2 intrinsics and SSE2 assembly differ so much in performance?
    3. Could someone show me how to disassemble the code, so that I can see what the compiler has turned the static library's strlen function into? I used gcc -S but didn't find the disassembly of strlen from <string.h>.
    4. I think my code may not be very good; I would appreciate it if you could help me improve it, especially the assembly versions.

    Thanks.

    Solution

    Like I said in comments, your biggest error is benchmarking with -O0. I discussed exactly why testing with -O0 is a terrible idea in the first part of another post.

    Benchmarks should be done with at least -O2, preferably with the same optimizations your full project will build with, if you're trying to test what source makes the fastest asm.

    -O0 explains the inline asm being way faster than the C with intrinsics (or regular compiled C, for the C strlen implementation borrowed from glibc).

    IDK whether -O0 would still optimize away a loop that discards the result of library strlen repeatedly, or if it somehow just avoided some other huge performance pitfall. It's not interesting to guess about exactly what happened in such a flawed test.


    I tightened up your SSE2 inline-asm version. Mostly just because I've been playing with gcc inline asm input/output constraints recently, and wanted to see what it would look like if I wrote it to let the compiler choose which registers to use for temporaries, and avoided unneeded instructions.

    The same inline asm works for 32-bit and 64-bit targets. When compiled to a stand-alone function, it doesn't have to save/restore any registers, even in 32-bit mode:

    #include <immintrin.h>
    
    size_t strlen_sse2_asm(const char* src){
    
      // const char *orig_src = src; // for a pointer-increment with a "+r" (src) output operand
    
      size_t result = 0;
      unsigned int tmp1;
      __m128i zero = _mm_setzero_si128(), vectmp;
    
      asm(
        "\n.Lloop:\n\t"
            "movdqu   (%[src], %[res]), %[vectmp]\n\t"  // result reg is used as the loop counter
            "pcmpeqb  %[zerovec], %[vectmp]\n\t"
            "pmovmskb %[vectmp], %[itmp]\n\t"
            "add      $0x10, %[res]\n\t"
            "test     %[itmp], %[itmp]\n\t"
            "jz  .Lloop\n\t"
    
        "bsf %[itmp], %[itmp]\n\t"
    "sub $0x10, %[res]\n\t"       // undo the final add: res has already advanced past the chunk holding the terminator
    "add %q[itmp], %q[res]\n\t"   // q modifier to get quadword register.
    // q is needed because add %edx, %rax doesn't work.  But in 32bit mode, q gives a 32bit reg, so the same code works
        : [res] "+r"(result), [vectmp] "=&x" (vectmp), [itmp] "=&r" (tmp1)
    
        : [zerovec] "x" (zero), // There might already be a zeroed vector reg when inlining
          [src] "r"(src)
        :
        );
      return result;
      // return result + tmp1;  // doing the add outside the asm makes gcc sign or zero-extend tmp1.
      // No benefit anyway, since gcc doesn't know that tmp1 is the offset within a 16B chunk or anything.
    }
    

    It should keep register pressure to a minimum when inlining, and doesn't tie up any special-purpose registers (like ecx which is needed for variable-count shifts).

    If you could shave another uop out of the inner loop, it would be down to 4 uops that could issue at one per cycle. As it is, 5 uops means each iteration takes 2 cycles to issue from the frontend, on Intel SnB-family CPUs.

    Using an aligned pointer would allow the load to fold into a memory operand for pcmpeqb. Interestingly, using the zero-vector as the destination for pcmpeqb is ok: you don't need to re-zero the vector between iterations, because you exit the loop if it's ever non-zero. It has 1-cycle latency, so turning the zero vector into a loop-carried dependency is only a problem when cache-misses delay an old iteration.

    AVX solves the problem completely. AVX allows the load to be folded even without doing an alignment check first. 3-operand non-destructive vpcmpeqb avoids turning the zero vector into a loop-carried dependency. AVX2 would allow checking 32B at once.
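    As an illustration only (the function name is mine, and it assumes reading up to 31 bytes past the terminator is safe, e.g. a buffer padded to a 32-byte boundary), the AVX2 idea could look like:

```c
#include <immintrin.h>
#include <stddef.h>

/* Hypothetical sketch of the 32B-at-a-time AVX2 check. */
__attribute__((target("avx2")))       /* allows compiling without -mavx2 */
size_t strlen_avx2_sketch(const char *src)
{
    const __m256i zero = _mm256_setzero_si256();
    for (size_t off = 0;; off += 32) {
        /* unaligned load folds under AVX; vpcmpeqb is non-destructive,
         * so `zero` never becomes part of a loop-carried dependency */
        __m256i v = _mm256_loadu_si256((const __m256i *)(src + off));
        unsigned mask = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero));
        if (mask)
            return off + (size_t)__builtin_ctz(mask);
    }
}
```

    The target attribute lets this compile without -mavx2 on the whole translation unit; at run time it still needs an AVX2-capable CPU.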

    Unrolling will help either way, but helps more without AVX. Align to a 64B boundary or something, and then load the whole cache line into four 16B vectors. Doing a combined check on the result of PORing them all together may be good, since pmovmsk + compare-and-branch is 2 uops.
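    With SSE2 intrinsics, the unrolled whole-cache-line check might be sketched like this (function name mine; assumes src is 64-byte aligned and the buffer is padded so full-line loads are safe):

```c
#include <emmintrin.h>
#include <stddef.h>

/* Sketch: load a whole 64B cache line as four 16B vectors, OR the four
 * compare results together, and pay for only one pmovmskb + branch per
 * line in the common (no terminator) case. Assumes 64B-aligned src. */
size_t strlen_sse2_unrolled64(const char *src)
{
    const __m128i zero = _mm_setzero_si128();
    for (size_t off = 0;; off += 64) {
        const __m128i *p = (const __m128i *)(src + off);
        __m128i c0 = _mm_cmpeq_epi8(_mm_load_si128(p + 0), zero);
        __m128i c1 = _mm_cmpeq_epi8(_mm_load_si128(p + 1), zero);
        __m128i c2 = _mm_cmpeq_epi8(_mm_load_si128(p + 2), zero);
        __m128i c3 = _mm_cmpeq_epi8(_mm_load_si128(p + 3), zero);
        __m128i any = _mm_or_si128(_mm_or_si128(c0, c1), _mm_or_si128(c2, c3));
        if (_mm_movemask_epi8(any)) {          /* terminator in this line */
            unsigned m0 = _mm_movemask_epi8(c0);
            if (m0) return off + __builtin_ctz(m0);
            unsigned m1 = _mm_movemask_epi8(c1);
            if (m1) return off + 16 + __builtin_ctz(m1);
            unsigned m2 = _mm_movemask_epi8(c2);
            if (m2) return off + 32 + __builtin_ctz(m2);
            return off + 48 + __builtin_ctz(_mm_movemask_epi8(c3));
        }
    }
}
```

    The slow path re-derives the per-vector masks only after the combined check fires, so the hot loop stays small.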

    Using SSE4.1 PTEST doesn't help (compared to pmovmsk / test / jnz) because it's 2 uops and can't macro-fuse the way test can.

    PTEST can directly test the whole 16B vector for being all-zero or all-ones (using the ANDNOT -> CF part), but not whether any single byte element is zero. (So we can't avoid pcmpeqb.)


    Have a look at Agner Fog's guides for optimizing asm, and the other links in the x86 tag wiki. Most optimization guides (Agner Fog's, and Intel's and AMD's) mention optimizing memcpy and strlen specifically, IIRC.
