校验code实施Neon在内部函数 [英] Checksum code implementation for Neon in Intrinsics

查看:528
本文介绍了校验code实施Neon在内部函数的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想实现校验和计算code(2的补加)的NEON,使用内部。当前的校验和计算正在对ARM进行。

我执行从存储器中取出128位在一次成NEON寄存器并执行的SIMD(加),并且结果是从128位的数字折叠以一个16位数字。

一切看起来都工作正常,但我的NEON实现消耗更多时间的ARM版本。

ARM版采用: 0.860000小号
NEON版本需要: 1.260000小号

注意:


  1. 使用公用程序异形time.h中

  2. 校验和函数调用10,000次从一个示例应用程序,并且所有的功能完整的运行
  3. 后计算时间

其他信息:


  1. 用于GNU工具链(ARM-NONE-Linux的gnueabi-GCC)编译的内在code,而不是手臂的工具链。

  2. Linux平台。

  3. C-内在code。

问题:


  1. 为什么NEON版本需要更多的时间比的ARM版本? (虽然我已照顾了内在与该批次最低循环使用)


  2. 如何实现我想达到什么? (与NEON效率)


  3. 可能有人点我或分享一些示例实现(伪code /算法/ code,而不是理论上的实施论文或谈判),使用ARM的NEON的互操作起来


任何帮助将是非常美联社preciated。

下面是我的code:

  uint16_t do_csum(const的无符号字符* BUFF,INT LEN)
{
诠释奇,伯爵,我;uint32x4_t结果= veorq_u32(结果,结果),总和= veorq_u32(总和,总和);
uint16x4_t数据,data_hi,data_low,DATA8;
uint16x8_t DATAQ;
uint16_t result16,DISP [20] = {0,0,0,0,0,0,0,0,0,0};如果(LEN&下; = 0)
    转到出来;
奇= 1&安培; (无符号长)的buff;
如果(奇数){
    uint8x8_t数据1 = veor_u8(数据1,数据1);
    数据1 =(uint16x4_t)vld1_lane_u8((uint8_t有*)的buff,数据1,0); //结果= * BUFF<< 8;
    DATA1 =(uint16x4_t)vshl_n_u16(DATA1,8);    len--;
    BUFF ++;
    结果= vaddw_u16(结果,数据1);
}
数= LEN>> 1; / *的16位字NR .. * /
如果(计数){
    如果(2及(无符号长)BUFF){
        uint16x4_t数据2 = veor_u16(数据2,数据2);
        数据2 =(uint16x4_t)vld1_lane_u16((uint16_t *)的buff,数据2,0); //结果+ = *(无符号短*)的buff;
        计数 - ;
        LEN - = 2;
        BUFF + = 2;
        结果= vaddw_u16(结果,数据2);
    }
    算上>> = 1; / *的32位字NR .. * /
    如果(计数){
        如果(4及(无符号长)BUFF){
            uint32x2_t DATA4 =(uint16x4_t)vld1_lane_u32((uint32_t的*)的buff,DATA4,0);
            计数 - ;
            LEN - = 4;
            BUFF + = 4;
            结果= vaddw_u16(结果,DATA4);
        }
        算上>> = 1; / *的64位字NR .. * /
        如果(计数){
            如果(8及(无符号长)BUFF){
                uint64x1_t DATA8 = vld1_u64((uint64_t中*)的buff);
                计数 - ;
                LEN - = 8;
                BUFF + = 8;
                结果= vaddw_u16(结果,(uint16x4_t)DATA8);
            }
            算上>> = 1; / * 128位字NR .. * /
            如果(计数){
                做{
                    DATAQ =(uint16x8_t)vld1q_u64((uint64_t中*)的buff); // VLD1.64 {D0,D1},[R0]
                    计数 - ;
                    BUFF + = 16;                    总和= vpaddlq_u16(DATAQ);
                    vst1q_u16(DISP,DATAQ); // VST1.16 {D0,D1},[R0]                    结果= vaddq_u32(求和,结果);
                }而(计数);
            }
            如果(LEN和8){
                uint64x1_t DATA8 = vld1_u64((uint64_t中*)的buff);
                BUFF + = 8;
                结果= vaddw_u16(结果,(uint16x4_t)DATA8);
            }
        }
        如果(LEN和4){
            uint32x2_t DATA4 = veor_u32(数据4,DATA4);            DATA4 =(uint16x4_t)vld1_lane_u32((uint32_t的*)的buff,DATA4,0); //结果+ = *(unsigned int类型*)的buff;
            BUFF + = 4;
            结果= vaddw_u16(结果,(uint16x4_t)数据4);
        }
    }
    如果(LEN和2){
        uint16x4_t数据2 = veor_u16(数据2,数据2);
        数据2 =(uint16x4_t)vld1_lane_u16((uint16_t *)的buff,数据2,0); //结果+ = *(无符号短*)的buff;
        BUFF + = 2;
        结果= vaddw_u16(结果,数据2);
    }
}
如果(LEN&放大器; 1){
    uint8x8_t数据1 = veor_u8(数据1,数据1);
    数据1 =(uint16x4_t)vld1_lane_u8((uint8_t有*)的buff,数据1,0); //结果= * BUFF<< 8;
    结果= vaddw_u8(结果,数据1);
}
result16 = from128to16(结果);如果(奇数)
    result16 =((result16>→8)及0xff的)| ((result16&放大器; 0xff的)下;&下; 8);出:
    返回result16;
}


解决方案

有几件事情可以改善:


  • 摆脱卖场为 DISP - 这看起来像调试code,它就剩在

  • 请不要你的主循环中做横向此外 - 只是做在循环部分的(垂直)的款项,并在循环后做最后一个水平除了(见的对于如何做到这一点的例子这个答案 - 这是上证所但原理是一样的)

  • 确保您使用 GCC -O3 ... 得到编译器优化最大的好处

  • 请不要使用转到! (不影响性能,而且是邪恶的。)

I'm trying to implement the checksum computation code(2's complement addition) for NEON, using intrinsic. The current checksum computation is being carried out on ARM.

My implementation fetches 128-bits at once from the memory into NEON registers and does SIMD (addition), and result is folded to a 16-bit number from a 128-bit number.

Everything looks to be working fine, but my NEON implementation is consuming more time that of the ARM version.

ARM version takes: 0.860000 s NEON version takes: 1.260000 s

Note:

  1. Profiled using utilities from "time.h"
  2. The checksum function called 10,000 times from a sample application, and time computed after complete run of all the functions

Other details:

  1. Used GNU tool-chain(arm-none-linux-gnueabi-gcc) for compiling the intrinsic code and not arm tool-chain.
  2. Linux platform.
  3. C-intrinsic code.

Questions:

  1. Why does NEON version take more time than that of the ARM version? (Although I have taken care that intrinsic with minimum cycles in the batch is used)

  2. How do achieve what I want to achieve? (efficiency with NEON)

  3. Could someone point to me or share some sample implementations(pseudo-code/algorithms/code, not the theoretical implementation papers or talks) which uses the inter-operations of ARM-NEON together?

Any help would be much appreciated.

Here's my code:

uint16_t do_csum(const unsigned char * buff, int len)
{
int odd, count, i;

uint32x4_t result = veorq_u32( result, result), sum = veorq_u32( sum, sum); 
uint16x4_t data, data_hi, data_low, data8;
uint16x8_t dataq;
uint16_t result16, disp[20] = {0,0,0,0,0,0,0,0,0,0};

if (len <= 0)
    goto out;
odd = 1 & (unsigned long) buff;
if (odd) {
    uint8x8_t data1 = veor_u8( data1, data1); 
    data1 = (uint16x4_t)vld1_lane_u8((uint8_t *)buff, data1, 0); //result = *buff << 8;
    data1 = (uint16x4_t)vshl_n_u16( data1, 8);

    len--;
    buff++;
    result = vaddw_u16(result, data1);
}
count = len >> 1;       /* nr of 16-bit words.. */
if (count) {
    if (2 & (unsigned long) buff) {
        uint16x4_t data2 = veor_u16( data2, data2); 
        data2 = (uint16x4_t) vld1_lane_u16((uint16_t *)buff, data2, 0); //result += *(unsigned short *) buff;
        count--;
        len -= 2;
        buff += 2;
        result = vaddw_u16( result, data2);
    }
    count >>= 1;        /* nr of 32-bit words.. */
    if (count) {
        if (4 & (unsigned long) buff) {
            uint32x2_t data4 = (uint16x4_t) vld1_lane_u32((uint32_t *) buff, data4, 0);
            count--;
            len -= 4;
            buff += 4;
            result = vaddw_u16( result, data4);
        }
        count >>= 1;    /* nr of 64-bit words.. */
        if (count) {
            if (8 & (unsigned long) buff) {
                uint64x1_t data8 = vld1_u64((uint64_t *) buff); 
                count--;
                len -= 8;
                buff += 8;
                result = vaddw_u16( result,(uint16x4_t)data8);
            }
            count >>= 1;    /* nr of 128-bit words.. */
            if (count) {
                do {
                    dataq = (uint16x8_t)vld1q_u64((uint64_t *) buff); // VLD1.64 {d0, d1}, [r0]
                    count--;
                    buff += 16;

                    sum = vpaddlq_u16(dataq);   
                    vst1q_u16( disp, dataq); // VST1.16 {d0, d1}, [r0]

                    result = vaddq_u32( sum, result);
                } while (count);
            }
            if (len & 8) {
                uint64x1_t data8 =  vld1_u64((uint64_t *) buff); 
                buff += 8;
                result = vaddw_u16( result, (uint16x4_t)data8);
            }
        }
        if (len & 4) {
            uint32x2_t data4 = veor_u32( data4, data4); 

            data4 = (uint16x4_t)vld1_lane_u32((uint32_t *) buff, data4, 0);//result += *(unsigned int *) buff;
            buff += 4;
            result = vaddw_u16( result,(uint16x4_t) data4);
        }
    }
    if (len & 2) {
        uint16x4_t data2 = veor_u16( data2, data2); 
        data2 = (uint16x4_t) vld1_lane_u16((uint16_t *)buff, data2, 0); //result += *(unsigned short *) buff;
        buff += 2;
        result = vaddw_u16( result, data2);
    }
}
if (len & 1){
    uint8x8_t data1 = veor_u8( data1, data1); 
    data1 = (uint16x4_t) vld1_lane_u8((uint8_t *)buff, data1, 0); //result = *buff << 8;
    result = vaddw_u8( result, data1);
}


result16 = from128to16(result);

if (odd)
    result16 = ((result16 >> 8) & 0xff) | ((result16 & 0xff) << 8);

out:
    return result16;
}

解决方案

A few things you can improve:

  • Get rid of the stores to disp - this looks like debug code that got left in ?
  • Don't do horizontal addition within your main loop - just do partial (vertical) sums in the loop and do one final horizontal addition after the loop (see this answer for an example of how to do this - it's for SSE but the principle is the same)
  • Make sure you use gcc -O3 ... to get maximum benefit from compiler optimisation
  • Don't use goto ! (Doesn't affect performance but is evil.)

这篇关于校验code实施Neon在内部函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆