快速计数两个阵列之间相等的字节数 [英] Fast counting the number of equal bytes between two arrays

查看:135
本文介绍了快速计数两个阵列之间相等的字节数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我写的函数 INT compare_16bytes(__ m128i LHS,__m128i RHS),以比较使用SSE指令两个16字节的数字:该函数返回多少字节后相等执行比较。

现在我想用上面的功能,以比较任意长度的两个字节数组:长度可能不是16字节的倍数,所以我需要处理这个问题。我怎么能完成以下功能的实现?我怎么能改善以下功能?

  INT fast_compare(为const char * S,为const char * T,INT长度)
{
    INT结果为0;    为const char *特征码= S;
    为const char * TPTR = T;    而(...)
    {
        常量__m128i * LHS =(常量__m128i *)特征码;
        常量__m128i * RHS =(常量__m128i *)TPTR;        //比较下16字节S和T
        结果+ = compare_16bytes(* LHS,RHS *);        特征码+ = 16;
        TPTR + = 16;
    }    返回结果;
}


解决方案

由于@Mysticial在评论中说上面做比较和纵向总结,然后只在主循环的末尾水平总结:

 的#include<&stdio.h中GT;
#包括LT&;&stdlib.h中GT;
#包括LT&;&time.h中GT;
#包括LT&;&emmintrin.h GT;//参考实现
INT fast_compare_ref(为const char * S,为const char * T,INT长度)
{
    INT结果为0;
    INT I;    对于(i = 0; I<长度; ++ I)
    {
        如果(S [I] == T [I])
            结果++;
    }
    返回结果;
}//优化的实现
INT fast_compare(为const char * S,为const char * T,INT长度)
{
    INT结果为0;
    INT I;    __m128i VSUM = _mm_set1_epi32(0);
    对于(i = 0; I<长度 - 15,I + = 16)
    {
        __m128i VS,VT,V,VH,VL,VTEMP;        VS = _mm_loadu_si128((__ m128i *)及S [I]); //负荷16从输入字符
        VT = _mm_loadu_si128((__ m128i *)& T公司[I]);
        V = _mm_cmpeq_epi8(VS,VT); // 比较
        VH = _mm_unpackhi_epi8(V,V); //解压比较结果转换成2×8×16位向量
        VL = _mm_unpacklo_epi8(V,V);
        VTEMP = _mm_madd_epi16(VH,VH); //积累16位向量为4×32位的部分和
        VSUM = _mm_add_epi32(VSUM,VTEMP);
        VTEMP = _mm_madd_epi16(VL,VL);
        VSUM = _mm_add_epi32(VSUM,VTEMP);
    }    //获取4×32位的部分和之和
    VSUM = _mm_add_epi32(VSUM,_mm_srli_si128(VSUM,8));
    VSUM = _mm_add_epi32(VSUM,_mm_srli_si128(VSUM,4));
    结果= _mm_cvtsi128_si32(VSUM);    //处理任何剩余的字节(小于16)
    如果(I<长度)
    {
        结果+ = fast_compare_ref(安培; S [I],& T公司[I],长 - 我);
    }    返回结果;
}//测试工具
INT主要(无效)
{
    const int的N = 1000000;
    字符* S =的malloc(N);
    字符* T =的malloc(N);
    INT I,result_ref,结果;    函数srand(时间(NULL));    对于(i = 0; I< N ++ I)
    {
        S [i] = RAND();
        T [i] = RAND();
    }    result_ref = fast_compare_ref(S,T,N);
    结果= fast_compare(S,T,N);    的printf(result_ref =%d个,导致=%d个\\ N,result_ref,结果);;    返回0;
}

编译并运行上述测试工具:

  $ gcc的-O3 -Wall -msse3 fast_compare.c -o fast_compare
$ ./fast_compare
result_ref = 3955,结果= 3955
$ ./fast_compare
result_ref = 3947,结果= 3947
$ ./fast_compare
result_ref = 3945,结果= 3945


请注意,在上述上证所code在这里我们使用 _mm_madd_epi16 来解包和累积,可能不明显招16位 0 / 1 值32位部分和。我们采取的事实,即 -1 * -1 = 1 (和 0 * 0 = 0 当然) - 我们并不是真的在做乘法这里,刚刚拆箱,在一个指令总结


更新:如在下面的评论所指出的,这种解决方案不是最佳的 - 我只是把一个相当最佳16位溶液并加入8位到16位的解压缩,使之成为8位的数据。然而,对于8位数据有更有效的方法,例如使用<一个href=\"https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8&expand=4512,4512\"相对=nofollow> psadbw / _mm_sad_epu8 。我将离开这里这个答案留给后人,和任何人谁可能想要做这种事情的16位数据,但真的不需要拆包输入数据的其他答案之一应该是公认的答案。

I wrote the function int compare_16bytes(__m128i lhs, __m128i rhs) in order to compare two 16 byte numbers using SSE instructions: this function returns how many bytes are equal after performing the comparison.

Now I would like use the above function in order to compare two byte arrays of arbitrary length: the length may not be a multiple of 16 bytes, so I need deal with this problem. How could I complete the implementation of the function below? How could I improve the function below?

int fast_compare(const char* s, const char* t, int length)
{
    int result = 0;

    const char* sPtr = s;
    const char* tPtr = t;

    while(...)
    {
        const __m128i* lhs = (const __m128i*)sPtr;
        const __m128i* rhs = (const __m128i*)tPtr;

        // compare the next 16 bytes of s and t
        result += compare_16bytes(*lhs,*rhs);

        sPtr += 16;
        tPtr += 16;
    }

    return result;
}

解决方案

As @Mysticial says in the comments above, do the compare and sum vertically and then just sum horizontally at the end of the main loop:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <emmintrin.h>

// reference implementation
int fast_compare_ref(const char *s, const char *t, int length)
{
    int result = 0;
    int i;

    for (i = 0; i < length; ++i)
    {
        if (s[i] == t[i])
            result++;
    }
    return result;
}

// optimised implementation
int fast_compare(const char *s, const char *t, int length)
{
    int result = 0;
    int i;

    __m128i vsum = _mm_set1_epi32(0);
    for (i = 0; i < length - 15; i += 16)
    {
        __m128i vs, vt, v, vh, vl, vtemp;

        vs = _mm_loadu_si128((__m128i *)&s[i]); // load 16 chars from input
        vt = _mm_loadu_si128((__m128i *)&t[i]);
        v = _mm_cmpeq_epi8(vs, vt);             // compare
        vh = _mm_unpackhi_epi8(v, v);           // unpack compare result into 2 x 8 x 16 bit vectors
        vl = _mm_unpacklo_epi8(v, v);
        vtemp = _mm_madd_epi16(vh, vh);         // accumulate 16 bit vectors into 4 x 32 bit partial sums
        vsum = _mm_add_epi32(vsum, vtemp);
        vtemp = _mm_madd_epi16(vl, vl);
        vsum = _mm_add_epi32(vsum, vtemp);
    }

    // get sum of 4 x 32 bit partial sums
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
    result = _mm_cvtsi128_si32(vsum);

    // handle any residual bytes ( < 16)
    if (i < length)
    {
        result += fast_compare_ref(&s[i], &t[i], length - i);
    }

    return result;
}

// test harness
int main(void)
{
    const int n = 1000000;
    char *s = malloc(n);
    char *t = malloc(n);
    int i, result_ref, result;

    srand(time(NULL));

    for (i = 0; i < n; ++i)
    {
        s[i] = rand();
        t[i] = rand();
    }

    result_ref = fast_compare_ref(s, t, n);
    result = fast_compare(s, t, n);

    printf("result_ref = %d, result = %d\n", result_ref, result);;

    return 0;
}

Compile and run the above test harness:

$ gcc -Wall -O3 -msse3 fast_compare.c -o fast_compare
$ ./fast_compare
result_ref = 3955, result = 3955
$ ./fast_compare
result_ref = 3947, result = 3947
$ ./fast_compare
result_ref = 3945, result = 3945


Note that there is one possibly non-obvious trick in the above SSE code where we use _mm_madd_epi16 to unpack and accumulate 16 bit 0/-1 values to 32 bit partial sums. We take advantage of the fact that -1*-1 = 1 (and 0*0 = 0 of course) - we're not really doing a multiply here, just unpacking and summing in one instruction.


UPDATE: as noted in the comments below, this solution is not optimal - I just took a fairly optimal 16 bit solution and added 8 bit to 16 bit unpacking to make it work for 8 bit data. However for 8 bit data there are more efficient methods, e.g. using psadbw/_mm_sad_epu8. I'll leave this answer here for posterity, and for anyone who might want to do this kind of thing with 16 bit data, but really one of the other answers which doesn't require unpacking the input data should be the accepted answer.

这篇关于快速计数两个阵列之间相等的字节数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆