校验code实施Neon在内部函数 [英] Checksum code implementation for Neon in Intrinsics
问题描述
我想实现校验和计算code(2的补加)的NEON,使用内部。当前的校验和计算正在对ARM进行。
我执行从存储器中取出128位在一次成NEON寄存器并执行的SIMD(加),并且结果是从128位的数字折叠以一个16位数字。
一切看起来都工作正常,但我的NEON实现消耗更多时间的ARM版本。
ARM版采用: 0.860000小号
NEON版本需要: 1.260000小号
注意:
- 使用公用程序异形time.h中
- 校验和函数调用10,000次从一个示例应用程序,并且所有的功能完整的运行 后计算时间
其他信息:
- 用于GNU工具链(ARM-NONE-Linux的gnueabi-GCC)编译的内在code,而不是手臂的工具链。
- Linux平台。
- C-内在code。
问题:
-
为什么NEON版本需要更多的时间比的ARM版本? (虽然我已照顾了内在与该批次最低循环使用)
-
如何实现我想达到什么? (与NEON效率)
-
可能有人点我或分享一些示例实现(伪code /算法/ code,而不是理论上的实施论文或谈判),使用ARM的NEON的互操作起来
任何帮助将是非常美联社preciated。
下面是我的code:
uint16_t do_csum(const的无符号字符* BUFF,INT LEN)
{
诠释奇,伯爵,我;uint32x4_t结果= veorq_u32(结果,结果),总和= veorq_u32(总和,总和);
uint16x4_t数据,data_hi,data_low,DATA8;
uint16x8_t DATAQ;
uint16_t result16,DISP [20] = {0,0,0,0,0,0,0,0,0,0};如果(LEN&下; = 0)
转到出来;
奇= 1&安培; (无符号长)的buff;
如果(奇数){
uint8x8_t数据1 = veor_u8(数据1,数据1);
数据1 =(uint16x4_t)vld1_lane_u8((uint8_t有*)的buff,数据1,0); //结果= * BUFF<< 8;
DATA1 =(uint16x4_t)vshl_n_u16(DATA1,8); len--;
BUFF ++;
结果= vaddw_u16(结果,数据1);
}
数= LEN>> 1; / *的16位字NR .. * /
如果(计数){
如果(2及(无符号长)BUFF){
uint16x4_t数据2 = veor_u16(数据2,数据2);
数据2 =(uint16x4_t)vld1_lane_u16((uint16_t *)的buff,数据2,0); //结果+ = *(无符号短*)的buff;
计数 - ;
LEN - = 2;
BUFF + = 2;
结果= vaddw_u16(结果,数据2);
}
算上>> = 1; / *的32位字NR .. * /
如果(计数){
如果(4及(无符号长)BUFF){
uint32x2_t DATA4 =(uint16x4_t)vld1_lane_u32((uint32_t的*)的buff,DATA4,0);
计数 - ;
LEN - = 4;
BUFF + = 4;
结果= vaddw_u16(结果,DATA4);
}
算上>> = 1; / *的64位字NR .. * /
如果(计数){
如果(8及(无符号长)BUFF){
uint64x1_t DATA8 = vld1_u64((uint64_t中*)的buff);
计数 - ;
LEN - = 8;
BUFF + = 8;
结果= vaddw_u16(结果,(uint16x4_t)DATA8);
}
算上>> = 1; / * 128位字NR .. * /
如果(计数){
做{
DATAQ =(uint16x8_t)vld1q_u64((uint64_t中*)的buff); // VLD1.64 {D0,D1},[R0]
计数 - ;
BUFF + = 16; 总和= vpaddlq_u16(DATAQ);
vst1q_u16(DISP,DATAQ); // VST1.16 {D0,D1},[R0] 结果= vaddq_u32(求和,结果);
}而(计数);
}
如果(LEN和8){
uint64x1_t DATA8 = vld1_u64((uint64_t中*)的buff);
BUFF + = 8;
结果= vaddw_u16(结果,(uint16x4_t)DATA8);
}
}
如果(LEN和4){
uint32x2_t DATA4 = veor_u32(数据4,DATA4); DATA4 =(uint16x4_t)vld1_lane_u32((uint32_t的*)的buff,DATA4,0); //结果+ = *(unsigned int类型*)的buff;
BUFF + = 4;
结果= vaddw_u16(结果,(uint16x4_t)数据4);
}
}
如果(LEN和2){
uint16x4_t数据2 = veor_u16(数据2,数据2);
数据2 =(uint16x4_t)vld1_lane_u16((uint16_t *)的buff,数据2,0); //结果+ = *(无符号短*)的buff;
BUFF + = 2;
结果= vaddw_u16(结果,数据2);
}
}
如果(LEN&放大器; 1){
uint8x8_t数据1 = veor_u8(数据1,数据1);
数据1 =(uint16x4_t)vld1_lane_u8((uint8_t有*)的buff,数据1,0); //结果= * BUFF<< 8;
结果= vaddw_u8(结果,数据1);
}
result16 = from128to16(结果);如果(奇数)
result16 =((result16>→8)及0xff的)| ((result16&放大器; 0xff的)下;&下; 8);出:
返回result16;
}
有几件事情可以改善:
- 摆脱卖场为
DISP
- 这看起来像调试code,它就剩在 ? - 请不要你的主循环中做横向此外 - 只是做在循环部分的(垂直)的款项,并在循环后做最后一个水平除了(见的对于如何做到这一点的例子这个答案 - 这是上证所但原理是一样的)
- 确保您使用
GCC -O3 ...
得到编译器优化最大的好处 - 请不要使用
转到
! (不影响性能,而且是邪恶的。)
I'm trying to implement the checksum computation code(2's complement addition) for NEON, using intrinsic. The current checksum computation is being carried out on ARM.
My implementation fetches 128-bits at once from the memory into NEON registers and does SIMD (addition), and result is folded to a 16-bit number from a 128-bit number.
Everything looks to be working fine, but my NEON implementation is consuming more time that of the ARM version.
ARM version takes: 0.860000 s NEON version takes: 1.260000 s
Note:
- Profiled using utilities from "time.h"
- The checksum function called 10,000 times from a sample application, and time computed after complete run of all the functions
Other details:
- Used GNU tool-chain(arm-none-linux-gnueabi-gcc) for compiling the intrinsic code and not arm tool-chain.
- Linux platform.
- C-intrinsic code.
Questions:
Why does NEON version take more time than that of the ARM version? (Although I have taken care that intrinsic with minimum cycles in the batch is used)
How do achieve what I want to achieve? (efficiency with NEON)
Could someone point to me or share some sample implementations(pseudo-code/algorithms/code, not the theoretical implementation papers or talks) which uses the inter-operations of ARM-NEON together?
Any help would be much appreciated.
Here's my code:
uint16_t do_csum(const unsigned char * buff, int len)
{
int odd, count, i;
uint32x4_t result = veorq_u32( result, result), sum = veorq_u32( sum, sum);
uint16x4_t data, data_hi, data_low, data8;
uint16x8_t dataq;
uint16_t result16, disp[20] = {0,0,0,0,0,0,0,0,0,0};
if (len <= 0)
goto out;
odd = 1 & (unsigned long) buff;
if (odd) {
uint8x8_t data1 = veor_u8( data1, data1);
data1 = (uint16x4_t)vld1_lane_u8((uint8_t *)buff, data1, 0); //result = *buff << 8;
data1 = (uint16x4_t)vshl_n_u16( data1, 8);
len--;
buff++;
result = vaddw_u16(result, data1);
}
count = len >> 1; /* nr of 16-bit words.. */
if (count) {
if (2 & (unsigned long) buff) {
uint16x4_t data2 = veor_u16( data2, data2);
data2 = (uint16x4_t) vld1_lane_u16((uint16_t *)buff, data2, 0); //result += *(unsigned short *) buff;
count--;
len -= 2;
buff += 2;
result = vaddw_u16( result, data2);
}
count >>= 1; /* nr of 32-bit words.. */
if (count) {
if (4 & (unsigned long) buff) {
uint32x2_t data4 = (uint16x4_t) vld1_lane_u32((uint32_t *) buff, data4, 0);
count--;
len -= 4;
buff += 4;
result = vaddw_u16( result, data4);
}
count >>= 1; /* nr of 64-bit words.. */
if (count) {
if (8 & (unsigned long) buff) {
uint64x1_t data8 = vld1_u64((uint64_t *) buff);
count--;
len -= 8;
buff += 8;
result = vaddw_u16( result,(uint16x4_t)data8);
}
count >>= 1; /* nr of 128-bit words.. */
if (count) {
do {
dataq = (uint16x8_t)vld1q_u64((uint64_t *) buff); // VLD1.64 {d0, d1}, [r0]
count--;
buff += 16;
sum = vpaddlq_u16(dataq);
vst1q_u16( disp, dataq); // VST1.16 {d0, d1}, [r0]
result = vaddq_u32( sum, result);
} while (count);
}
if (len & 8) {
uint64x1_t data8 = vld1_u64((uint64_t *) buff);
buff += 8;
result = vaddw_u16( result, (uint16x4_t)data8);
}
}
if (len & 4) {
uint32x2_t data4 = veor_u32( data4, data4);
data4 = (uint16x4_t)vld1_lane_u32((uint32_t *) buff, data4, 0);//result += *(unsigned int *) buff;
buff += 4;
result = vaddw_u16( result,(uint16x4_t) data4);
}
}
if (len & 2) {
uint16x4_t data2 = veor_u16( data2, data2);
data2 = (uint16x4_t) vld1_lane_u16((uint16_t *)buff, data2, 0); //result += *(unsigned short *) buff;
buff += 2;
result = vaddw_u16( result, data2);
}
}
if (len & 1){
uint8x8_t data1 = veor_u8( data1, data1);
data1 = (uint16x4_t) vld1_lane_u8((uint8_t *)buff, data1, 0); //result = *buff << 8;
result = vaddw_u8( result, data1);
}
result16 = from128to16(result);
if (odd)
result16 = ((result16 >> 8) & 0xff) | ((result16 & 0xff) << 8);
out:
return result16;
}
A few things you can improve:
- Get rid of the stores to
disp
- this looks like debug code that got left in ? - Don't do horizontal addition within your main loop - just do partial (vertical) sums in the loop and do one final horizontal addition after the loop (see this answer for an example of how to do this - it's for SSE but the principle is the same)
- Make sure you use
gcc -O3 ...
to get maximum benefit from compiler optimisation - Don't use
goto
! (Doesn't affect performance but is evil.)
这篇关于校验code实施Neon在内部函数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!