用C最快的解交错操作? [英] Fastest de-interleave operation in C?

查看:651
本文介绍了用C最快的解交错操作?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有一个指针字节数组混合包含两个不同阵列的交错字节数组1 数组2 。说混合看起来是这样的:

  a1b2c3d4 ...

我需要做的,所以我得到的是去交织的字节 ARRAY1 = ABCD ... 数组2 = 1234 ... 。我所知道的长度混合时间提前,而数组1 的长度和数组2 是等价的,都等于混合/ 2

下面是我当前的实现(数组1 数组2 已经分配):

  INT I,J;
INT mixedLength_2 = mixedLength / 2;
对于(i = 0,J = 0; I< mixedLength_2;我++,J + = 2)
{
    数组1 [i] =混合[J]。
    数组2 [I] =混合[J + 1];
}

这避免了任何昂贵的乘除运算,但仍不能运行速度不够快。我希望能有像的memcpy 的需要,可以使用低级别的块拷贝操作,加速过程中的索引。是否有一个更快的实现比我现在有?

修改

目标平台是Objective-C的iOS和Mac上。快速操作为iOS设备更重要,所以针对iOS的一个解决方案,专会比没有好。

更新

感谢大家的反应,尤其是斯蒂芬佳能,格雷厄姆李和Mecki。下面是一个使用斯蒂芬的NEON内在如有否则格雷厄姆的工会用游标迭代数量减少由Mecki的建议我的主人的功能。

 无效交错(常量uint8_t有* srcA表示,常量uint8_t有* srcB和uint8_t有* dstAB,为size_t dstABLength)
{
#如果定义__ARM_NEON__
    //尝试使用NEON内在    //一次迭代32字节
    div_t dstABLength_32 = DIV(dstABLength,32);
    如果(dstABLength_32.rem == 0)
    {
        而(dstABLength_32.quot - &0)
        {
            常量uint8x16_t A = vld1q_u8(srcA表示);
            常量uint8x16_t B = vld1q_u8(SRCB);
            常量uint8x16x2_t AB = {A,B};
            vst2q_u8(dstAB,AB);
            和srcA + = 16;
            srcB和+ = 16;
            dstAB + = 32;
        }
        返回;
    }    //一次迭代16字节
    div_t dstABLength_16 = DIV(dstABLength,16);
    如果(dstABLength_16.rem == 0)
    {
        而(dstABLength_16.quot - &0)
        {
            常量uint8x8_t A = vld1_u8(srcA表示);
            常量uint8x8_t B = vld1_u8(SRCB);
            常量uint8x8x2_t AB = {A,B};
            vst2_u8(dstAB,AB);
            和srcA + = 8;
            srcB和+ = 8;
            dstAB + = 16;
        }
        返回;
    }
#万一    //如果这些字节不正确对齐
    //或NEON不可用,回落到
    //优化迭代    同时//迭代8个字节
    div_t dstABLength_8 = DIV(dstABLength,8);
    如果(dstABLength_8.rem == 0)
    {
        工会的typedef
        {
            uint64_t中宽;
            结构{uint8_t有A1; uint8_t有B1; uint8_t有A2; uint8_t有B2; uint8_t有A3; uint8_t有B3; uint8_t有A4; uint8_t有B4; } 狭窄;
        } ab8x8_t;        uint64_t中* dstAB64 =(uint64_t中*)dstAB;
        INT J = 0;
        的for(int i = 0; I< dstABLength_8.quot;我++)
        {
            ab8x8_t光标;
            cursor.narrow.a1 = srcA表示[J]。
            cursor.narrow.b1 = srcB和[J ++];
            cursor.narrow.a2 = srcA表示[J]。
            cursor.narrow.b2 = srcB和[J ++];
            cursor.narrow.a3 = srcA表示[J]。
            cursor.narrow.b3 = srcB和[J ++];
            cursor.narrow.a4 = srcA表示[J]。
            cursor.narrow.b4 = srcB和[J ++];
            dstAB64 [I] = cursor.wide;
        }
        返回;
    }    在一个时间//迭代4字节
    div_t dstABLength_4 = DIV(dstABLength,4);
    如果(dstABLength_4.rem == 0)
    {
        工会的typedef
        {
            uint32_t的宽;
            结构{uint8_t有A1; uint8_t有B1; uint8_t有A2; uint8_t有B2; } 狭窄;
        } ab8x4_t;        uint32_t的* dstAB32 =(uint32_t的*)dstAB;
        INT J = 0;
        的for(int i = 0; I< dstABLength_4.quot;我++)
        {
            ab8x4_t光标;
            cursor.narrow.a1 = srcA表示[J]。
            cursor.narrow.b1 = srcB和[J ++];
            cursor.narrow.a2 = srcA表示[J]。
            cursor.narrow.b2 = srcB和[J ++];
            dstAB32 [I] = cursor.wide;
        }
        返回;
    }    在一个时间//迭代2个字节
    div_t dstABLength_2 = DIV(dstABLength,2);
    工会的typedef
    {
        uint16_t宽;
        结构{uint8_t有一个; uint8_t有B: } 狭窄;
    } ab8x2_t;    uint16_t * dstAB16 =(uint16_t *)dstAB;
    的for(int i = 0; I< dstABLength_2.quot;我++)
    {
        ab8x2_t光标;
        cursor.narrow.a = srcA表示[I]
        cursor.narrow.b = srcB和[I]
        dstAB16 [I] = cursor.wide;
    }
}无效解交织(常量uint8_t有* srcAB,uint8_t有* DSTA,uint8_t有*数字机顶盒,为size_t srcABLength)
{
#如果定义__ARM_NEON__
    //尝试使用NEON内在    //一次迭代32字节
    div_t srcABLength_32 = DIV(srcABLength,32);
    如果(srcABLength_32.rem == 0)
    {
        而(srcABLength_32.quot - &0)
        {
            常量uint8x16x2_t AB = vld2q_u8(srcAB);
            vst1q_u8(DSTA,ab.val [0]);
            vst1q_u8(DSTB,ab.val [1]);
            srcAB + = 32;
            DSTA + = 16;
            数字机顶盒+ = 16;
        }
        返回;
    }    //一次迭代16字节
    div_t srcABLength_16 = DIV(srcABLength,16);
    如果(srcABLength_16.rem == 0)
    {
        而(srcABLength_16.quot - &0)
        {
            常量uint8x8x2_t AB = vld2_u8(srcAB);
            vst1_u8(DSTA,ab.val [0]);
            vst1_u8(DSTB,ab.val [1]);
            srcAB + = 16;
            DSTA + = 8;
            数字机顶盒+ = 8;
        }
        返回;
    }
#万一    //如果这些字节不正确对齐
    //或NEON不可用,回落到
    //优化迭代    同时//迭代8个字节
    div_t srcABLength_8 = DIV(srcABLength,8);
    如果(srcABLength_8.rem == 0)
    {
        工会的typedef
        {
            uint64_t中宽;
            结构{uint8_t有A1; uint8_t有B1; uint8_t有A2; uint8_t有B2; uint8_t有A3; uint8_t有B3; uint8_t有A4; uint8_t有B4; } 狭窄;
        } ab8x8_t;        uint64_t中* srcAB64 =(uint64_t中*)srcAB;
        INT J = 0;
        的for(int i = 0; I< srcABLength_8.quot;我++)
        {
            ab8x8_t光标;
            cursor.wide = srcAB64 [I]
            DSTA [J] = cursor.narrow.a1;
            数字机顶盒[J ++] = cursor.narrow.b1;
            DSTA [J] = cursor.narrow.a2;
            数字机顶盒[J ++] = cursor.narrow.b2;
            DSTA [J] = cursor.narrow.a3;
            数字机顶盒[J ++] = cursor.narrow.b3;
            DSTA [J] = cursor.narrow.a4;
            数字机顶盒[J ++] = cursor.narrow.b4;
        }
        返回;
    }    在一个时间//迭代4字节
    div_t srcABLength_4 = DIV(srcABLength,4);
    如果(srcABLength_4.rem == 0)
    {
        工会的typedef
        {
            uint32_t的宽;
            结构{uint8_t有A1; uint8_t有B1; uint8_t有A2; uint8_t有B2; } 狭窄;
        } ab8x4_t;        uint32_t的* srcAB32 =(uint32_t的*)srcAB;
        INT J = 0;
        的for(int i = 0; I< srcABLength_4.quot;我++)
        {
            ab8x4_t光标;
            cursor.wide = srcAB32 [I]
            DSTA [J] = cursor.narrow.a1;
            数字机顶盒[J ++] = cursor.narrow.b1;
            DSTA [J] = cursor.narrow.a2;
            数字机顶盒[J ++] = cursor.narrow.b2;
        }
        返回;
    }    在一个时间//迭代2个字节
    div_t srcABLength_2 = DIV(srcABLength,2);
    工会的typedef
    {
        uint16_t宽;
        结构{uint8_t有一个; uint8_t有B: } 狭窄;
    } ab8x2_t;    uint16_t * srcAB16 =(uint16_t *)srcAB;
    的for(int i = 0; I< srcABLength_2.quot;我++)
    {
        ab8x2_t光标;
        cursor.wide = srcAB16 [I]
        DSTA [I] = cursor.narrow.a;
        数字机顶盒[I] = cursor.narrow.b;
    }
}


解决方案

关闭我的头顶,我不知道去交错2声道字节数据的库函数。但是它的价值提出与苹果的bug报告,要求这样的功能。

在此期间,它是pretty易于使用矢量化NEON或SSE内在这样的功能。具体来说,在ARM你将要使用 vld1q_u8 从每个源数组加载一个矢量, vuzpq_u8 来去交织他们和 vst1q_u8 来存储得到的载体;这里有一个粗略的草图,我没有测试过,甚至试图建立,但应该说明的总体思路。更复杂的实现是绝对有可能(尤其是,氖可以加载/存储的两个的16B登记在一个单一的指令,该指令的编译器可能不与此做,和流水线和/或展开的一些量可以益取决于你的缓冲区有多长):

 #如果定义__ARM_NEON__
#包括LT&;&arm_neon.h GT;
#万一
#包括LT&;&stdint.h GT;
#包括LT&;&STDDEF.H GT;无效解交织(uint8_t有*混合,uint8_t有* ARRAY1,uint8_t有*数组2,为size_t mixedLength){
#如果定义__ARM_NEON__
    为size_t载体= mixedLength / 32;
    mixedLength%= 32;
    而(载体 - 大于0){
        常量uint8x16_t src0 = vld1q_u8(混合);
        常量uint8x16_t SRC1 = vld1q_u8(混合+ 16);
        常量uint8x16x2_t DST = vuzpq_u8(src0,SRC1);
        vst1q_u8(数组1,dst.val [0]);
        vst1q_u8(数组2,dst.val [1]);
        混合+ = 32;
        数组1 + = 16;
        数组2 + = 16;
    }
#万一
    用于(为size_t我= 0; I< mixedLength / 2; ++ I){
        数组1 [i] =混合[2 * i];
        数组2 [I] =混合[2 * I + 1];
    }
}

I have a pointer to an array of bytes mixed that contains the interleaved bytes of two distinct arrays array1 and array2. Say mixed looks something like this:

a1b2c3d4...

What I need to do is de-interleave the bytes so I get array1 = abcd... and array2 = 1234.... I know the length of mixed ahead of time, and the lengths of array1 and array2 are equivalent, both equal to mixed / 2.

Here is my current implementation (array1 and array2 are already allocated):

int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
    array1[i] = mixed[j];
    array2[i] = mixed[j+1];
}

This avoids any expensive multiplication or division operations, but still doesn't run fast enough. I'm hoping there is something like memcpy that takes an indexer that can use low-level block copy operations to speed up the process. Is there a faster implementation than what I currently have?

Edit

The target platform is Objective-C for iOS and Mac. A fast operation is more important for iOS devices, so a solution targeting iOS specifically would be better than nothing.

Update

Thanks everyone for the responses, especially Stephen Canon, Graham Lee, and Mecki. Here is my "master" function that uses Stephen's NEON intrinsics if available and otherwise Graham's union cursors with a reduced number of iterations as suggested by Mecki.

void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t dstABLength_32 = div(dstABLength, 32);
    if (dstABLength_32.rem == 0)
    {
        while (dstABLength_32.quot --> 0)
        {
            const uint8x16_t a = vld1q_u8(srcA);
            const uint8x16_t b = vld1q_u8(srcB);
            const uint8x16x2_t ab = { a, b };
            vst2q_u8(dstAB, ab);
            srcA += 16;
            srcB += 16;
            dstAB += 32;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t dstABLength_16 = div(dstABLength, 16);
    if (dstABLength_16.rem == 0)
    {
        while (dstABLength_16.quot --> 0)
        {
            const uint8x8_t a = vld1_u8(srcA);
            const uint8x8_t b = vld1_u8(srcB);
            const uint8x8x2_t ab = { a, b };
            vst2_u8(dstAB, ab);
            srcA += 8;
            srcB += 8;
            dstAB += 16;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t dstABLength_8 = div(dstABLength, 8);
    if (dstABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *dstAB64 = (uint64_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            cursor.narrow.a3 = srcA[j  ];
            cursor.narrow.b3 = srcB[j++];
            cursor.narrow.a4 = srcA[j  ];
            cursor.narrow.b4 = srcB[j++];
            dstAB64[i] = cursor.wide;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t dstABLength_4 = div(dstABLength, 4);
    if (dstABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *dstAB32 = (uint32_t *)dstAB;
        int j = 0;
        for (int i = 0; i < dstABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.narrow.a1 = srcA[j  ];
            cursor.narrow.b1 = srcB[j++];
            cursor.narrow.a2 = srcA[j  ];
            cursor.narrow.b2 = srcB[j++];
            dstAB32[i] = cursor.wide;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t dstABLength_2 = div(dstABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *dstAB16 = (uint16_t *)dstAB;
    for (int i = 0; i < dstABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.narrow.a = srcA[i];
        cursor.narrow.b = srcB[i];
        dstAB16[i] = cursor.wide;
    }
}

void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
    // attempt to use NEON intrinsics

    // iterate 32-bytes at a time
    div_t srcABLength_32 = div(srcABLength, 32);
    if (srcABLength_32.rem == 0)
    {
        while (srcABLength_32.quot --> 0)
        {
            const uint8x16x2_t ab = vld2q_u8(srcAB);
            vst1q_u8(dstA, ab.val[0]);
            vst1q_u8(dstB, ab.val[1]);
            srcAB += 32;
            dstA += 16;
            dstB += 16;
        }
        return;
    }

    // iterate 16-bytes at a time
    div_t srcABLength_16 = div(srcABLength, 16);
    if (srcABLength_16.rem == 0)
    {
        while (srcABLength_16.quot --> 0)
        {
            const uint8x8x2_t ab = vld2_u8(srcAB);
            vst1_u8(dstA, ab.val[0]);
            vst1_u8(dstB, ab.val[1]);
            srcAB += 16;
            dstA += 8;
            dstB += 8;
        }
        return;
    }
#endif

    // if the bytes were not aligned properly
    // or NEON is unavailable, fall back to
    // an optimized iteration

    // iterate 8-bytes at a time
    div_t srcABLength_8 = div(srcABLength, 8);
    if (srcABLength_8.rem == 0)
    {
        typedef union
        {
            uint64_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
        } ab8x8_t;

        uint64_t *srcAB64 = (uint64_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_8.quot; i++)
        {
            ab8x8_t cursor;
            cursor.wide = srcAB64[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
            dstA[j  ] = cursor.narrow.a3;
            dstB[j++] = cursor.narrow.b3;
            dstA[j  ] = cursor.narrow.a4;
            dstB[j++] = cursor.narrow.b4;
        }
        return;
    }

    // iterate 4-bytes at a time
    div_t srcABLength_4 = div(srcABLength, 4);
    if (srcABLength_4.rem == 0)
    {
        typedef union
        {
            uint32_t wide;
            struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
        } ab8x4_t;

        uint32_t *srcAB32 = (uint32_t *)srcAB;
        int j = 0;
        for (int i = 0; i < srcABLength_4.quot; i++)
        {
            ab8x4_t cursor;
            cursor.wide = srcAB32[i];
            dstA[j  ] = cursor.narrow.a1;
            dstB[j++] = cursor.narrow.b1;
            dstA[j  ] = cursor.narrow.a2;
            dstB[j++] = cursor.narrow.b2;
        }
        return;
    }

    // iterate 2-bytes at a time
    div_t srcABLength_2 = div(srcABLength, 2);
    typedef union
    {
        uint16_t wide;
        struct { uint8_t a; uint8_t b; } narrow;
    } ab8x2_t;

    uint16_t *srcAB16 = (uint16_t *)srcAB;
    for (int i = 0; i < srcABLength_2.quot; i++)
    {
        ab8x2_t cursor;
        cursor.wide = srcAB16[i];
        dstA[i] = cursor.narrow.a;
        dstB[i] = cursor.narrow.b;
    }
}

解决方案

Off the top of my head, I don't know of a library function for de-interleaving 2 channel byte data. However it's worth filing a bug report with Apple to request such a function.

In the meantime, it's pretty easy to vectorize such a function using NEON or SSE intrinsics. Specifically, on ARM you will want to use vld1q_u8 to load a vector from each source array, vuzpq_u8 to de-interleave them, and vst1q_u8 to store the resulting vectors; here's a rough sketch that I haven't tested or even tried to build, but it should illustrate the general idea. More sophisticated implementations are definitely possible (in particular, NEON can load/store two 16B registers in a single instruction, which the compiler may not do with this, and some amount of pipelining and/or unrolling may be beneficial depending on how long your buffers are):

#if defined __ARM_NEON__
#   include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) {
#if defined __ARM_NEON__
    size_t vectors = mixedLength / 32;
    mixedLength %= 32;
    while (vectors --> 0) {
        const uint8x16_t src0 = vld1q_u8(mixed);
        const uint8x16_t src1 = vld1q_u8(mixed + 16);
        const uint8x16x2_t dst = vuzpq_u8(src0, src1);
        vst1q_u8(array1, dst.val[0]);
        vst1q_u8(array2, dst.val[1]);
        mixed += 32;
        array1 += 16;
        array2 += 16;
    }
#endif
    for (size_t i=0; i<mixedLength/2; ++i) {
        array1[i] = mixed[2*i];
        array2[i] = mixed[2*i + 1];
    }
}

这篇关于用C最快的解交错操作?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆