使用 NEON 优化 RGBA8888 到 RGB565 的转换 [英] Optimizing RGBA8888 to RGB565 conversion with NEON
问题描述
我正在尝试使用 NEON 矢量指令集优化 iOS 上的图像格式转换.我原以为这类任务很适合用 NEON 来做,因为它处理的是大量相似的数据.
I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.
不过,我的尝试并没有那么顺利,与简单的 c 实现相比,仅实现了微不足道的加速:
My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:
/* Scalar reference conversion: one 32-bit source pixel in, one RGB565 pixel out.
 * The low byte of the source word is treated as red, the next byte as green and
 * the third byte as blue; the top byte is ignored. */
for (int i = 0; i < pixelCount; ++i) {
    const unsigned int px = *inPixel32++;
    /* Keep the top 5/6/5 bits of each 8-bit channel. */
    const unsigned int r5 = (px >> 3)  & 0x1F;
    const unsigned int g6 = (px >> 10) & 0x3F;
    const unsigned int b5 = (px >> 19) & 0x1F;
    *outPixel16++ = (r5 << 11) | (g6 << 5) | b5;
}
iPad 2 上的 1 百万像素图像阵列:
1 megapixel image array on iPad 2:
格式为 [min avg max n=timer samples] 以毫秒为单位
format is [min avg max n=number of timer samples] in milliseconds
C:[14.446 14.632 18.405 n=1000]ms
C: [14.446 14.632 18.405 n=1000]ms
NEON:[11.920 12.032 15.336 n=1000]ms
NEON: [11.920 12.032 15.336 n=1000]ms
我对 NEON 实现的尝试如下:
My attempt at a NEON implementation is below:
int i;
const int pixelsPerLoop = 8;
// Vector loop: converts 8 pixels per iteration.
// It only runs while a FULL group of 8 pixels remains; the previous condition
// (i < pixelCount) ran one extra iteration whenever pixelCount was not a
// multiple of 8, reading and writing past the end of both buffers.
// On exit, i is the number of pixels converted and both pointers sit on the
// first unconverted pixel, ready for the scalar remainder code that follows.
for (i = 0; i + pixelsPerLoop <= pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    // De-interleave 8 pixels (32 bytes): val[0]=r, val[1]=g, val[2]=b, val[3]=a
    uint8x8x4_t rgba = vld4_u8((const uint8_t *)inPixel32);

    // Drop the low bits so each channel is 5/6/5 bits wide
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    // Widen to 16 bits and move each channel to its position: r -> 15:11, g -> 10:5, b -> 4:0
    uint16x8_t r16 = vshlq_n_u16(vmovl_u8(r), 11);
    uint16x8_t g16 = vshlq_n_u16(vmovl_u8(g), 5);
    uint16x8_t b16 = vmovl_u8(b);

    // Combine with the intrinsic rather than the compiler-specific |= on vector types
    uint16x8_t r5_g6_b5 = vorrq_u16(vorrq_u16(r16, g16), b16);

    // Write 8 RGB565 pixels back to memory
    vst1q_u16(outPixel16, r5_g6_b5);
}
//Do the remainder on normal flt hardware
代码通过 LLVM 3.0 编译成以下内容(.loc 和额外标签已删除):
Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):
// Compiler-generated code for the conversion routine (Thumb-2 + NEON).
// As used below: r0 is the address pixels are loaded from, r2 is the address
// RGB565 halfwords are stored to, and r1 is a length whose value >> 2 is used
// as the pixel count. The value returned in r0 is r12: 0 on the early-exit
// paths, 1 otherwise.
_DNConvert_ARGB8888toRGB565:
push {r4, r5, r7, lr}
// r9 = incoming r1; r12 = return value, initially 0
mov r9, r1
mov.w r12, #0
add r7, sp, #8
// r1 = (r2 != 0)
cmp r2, #0
mov.w r1, #0
it ne
movne r1, #1
// r3 = (r0 != 0)
cmp r0, #0
mov.w r3, #0
it ne
movne r3, #1
// r4 = (r9 != 0)
cmp.w r9, #0
mov.w r4, #0
it ne
movne r4, #1
// Exit returning 0 if r9 is not a multiple of 4 ...
tst.w r9, #3
bne LBB0_8
// ... or if any of r0, r2, r9 is zero
ands r1, r3
ands r1, r4
cmp r1, #1
bne LBB0_8
// lr = r9 >> 2 (pixel count); a non-zero count goes to the NEON loop setup
movs r1, #0
lsr.w lr, r9, #2
cmp.w r1, r9, lsr #2
bne LBB0_9
// Zero-count path: scalar pointers start at the buffer starts, r1 (pixels done) = 0
mov r3, r2
mov r5, r0
b LBB0_5
// After the NEON loop: r1 = (lr + 7) & 0x7FFFFFF8, i.e. the count rounded up to a multiple of 8
LBB0_4:
movw r1, #65528
add.w r0, lr, #7
movt r1, #32767
ands r1, r0
// Return value becomes 1; skip the scalar loop when r1 >= lr (unsigned).
// NOTE(review): r1 here is either 0 with lr == 0, or lr rounded UP to a multiple
// of 8, so r1 >= lr appears to always hold and the scalar loop below looks
// unreachable; the NEON loop then covers the tail and can overrun the buffers
// when the count is not a multiple of 8.
LBB0_5:
mov.w r12, #1
cmp r1, lr
bhs LBB0_8
// r0 = remaining pixels = (r9 >> 2) - r1; r9 = 0xF800 mask, lr = 0x07E0 mask
rsb r0, r1, r9, lsr #2
mov.w r9, #63488
mov.w lr, #2016
mov.w r12, #1
// Scalar loop, one pixel per iteration:
//   r1 = (px << 8) & 0xF800   (byte 0 top 5 bits -> bits 15:11)
//   r4 = (px >> 5) & 0x07E0   (byte 1 top 6 bits -> bits 10:5)
//   r2 = px[23:19]            (byte 2 top 5 bits -> bits 4:0)
LBB0_7:
ldr r2, [r5], #4
subs r0, #1
and.w r1, r9, r2, lsl #8
and.w r4, lr, r2, lsr #5
ubfx r2, r2, #19, #5
orr.w r2, r2, r4
orr.w r1, r1, r2
strh r1, [r3], #2
bne LBB0_7
// Common exit: return r12
LBB0_8:
mov r0, r12
pop {r4, r5, r7, pc}
// NEON loop setup: precompute the scalar-loop pointers for after the vector loop
//   r5 = r0 + ((32 + (lr - 1) * 4) & ~31)
//   r3 = r2 + ((16 + (lr - 1) * 2) & ~15)
//   r1 = 0 (pixels processed so far)
LBB0_9:
sub.w r1, lr, #1
movs r3, #32
add.w r3, r3, r1, lsl #2
bic r3, r3, #31
adds r5, r0, r3
movs r3, #16
add.w r1, r3, r1, lsl #1
bic r1, r1, #15
adds r3, r2, r1
movs r1, #0
// NEON loop, 8 pixels per iteration: de-interleaving load of 32 bytes from [r0]!,
// per-channel right shifts (3, 2, 3), widening to 16 bits, left shifts (11, 5),
// two ORs, then a 16-byte store to [r2]!. The flags consumed by blo are set by
// the cmp r1, lr near the top of the loop, so it repeats while r1 < lr (unsigned).
LBB0_10:
vld4.8 {d16, d17, d18, d19}, [r0]!
adds r1, #8
cmp r1, lr
vshr.u8 d20, d16, #3
vshr.u8 d21, d17, #2
vshr.u8 d16, d18, #3
vmovl.u8 q11, d20
vmovl.u8 q9, d21
vmovl.u8 q8, d16
vshl.i16 q10, q11, #11
vshl.i16 q9, q9, #5
vorr q8, q8, q10
vorr q8, q8, q9
vst1.16 {d16, d17}, [r2]!
Ltmp28:
blo LBB0_10
b LBB0_4
完整代码可在 https://github.com/darknoon/DNImageConvert 获得.如能得到任何帮助,我将不胜感激,谢谢!
Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!
推荐答案
这里是为 XCode 准备的手动优化的 NEON 实现:
Here you are, hand-optimized NEON implementation ready for XCode :
/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
* BGRA2RGB565.s
*
* Created by Jake "Alquimista" Lee on 11. 11. 1..
* Copyright 2011 Jake Lee. All rights reserved.
*/
// This first version is kept for reference only (see the warning in the header).
// Intended contract, as far as the code shows: count must be a multiple of 8,
// otherwise the routine returns 0 in r0; each loop iteration loads 32 source
// bytes and stores 16 destination bytes, advancing both pointers.
.align 2
.globl _bgra2rgb565_neon
.private_extern _bgra2rgb565_neon
// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//ARM
pDst .req r0
pSrc .req r1
count .req r2
//NEON
// vld4.8 de-interleaves source bytes 0,1,2,3 of each pixel into blu, grn, red, alp
blu .req d16
grn .req d17
red .req d18
alp .req d19
// Output aliases: rg reuses red (d18), gb reuses blu (d16)
rg .req red
gb .req blu
_bgra2rgb565_neon:
pld [pSrc]
// Reject counts that are not a multiple of 8: return 0
tst count, #0x7
movne r0, #0
bxne lr
loop:
pld [pSrc, #32]
vld4.8 {blu, grn, red, alp}, [pSrc]!
subs count, count, #8
// NOTE(review): per the ARM NEON reference, VEXT.8 extracts whole BYTES from the
// concatenation of its two source registers (mixing lanes, i.e. different pixels);
// it does not merge bit fields within a lane. The two vext lines below therefore
// do not appear to pack 5/6/5-bit channels, which is presumably why this version
// is marked as not working. Confirm against the architecture manual.
vshr.u8 red, red, #3
vext.8 rg, grn, red, #5
vshr.u8 grn, grn, #2
vext.8 gb, blu, grn, #3
// Interleaved store of the gb and rg byte vectors (d16, d18), 16 bytes, with writeback
vst2.8 {gb, rg}, [pDst]!
bgt loop
bx lr
这个版本会比你建议的快很多倍:
This version will be many times faster than what you suggested :
通过 PLD 提高缓存命中率
increased cache hit rate via PLD
不需要转换为"long"
conversion to "long" not necessary
循环中的指令更少
不过仍有一些优化空间:您可以修改循环,使其每次迭代转换 16 个像素而不是 8 个.这样就可以重新安排指令,完全避免那两处流水线停顿(这在上面每次迭代 8 个像素的版本中根本做不到),并且还能受益于 NEON 的双发射(dual-issue)能力.
There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8. Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.
我没有这样做,因为它会使代码难以理解.
I didn't do this because it would make the code hard to understand.
了解 VEXT 应该做什么很重要.
It's important to know what VEXT is supposed to do.
现在由你决定.:)
我验证了这段代码可以在 Xcode 下正确编译.虽然我很确定它也能正常工作,但我不能保证这一点,因为我没有测试环境.如果出现故障,请告诉我.届时我会相应地更正.
I verified this code to be properly compiled under Xcode. Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment. In case of malfunctioning, please let me know. I'll correct it accordingly then.
再见(cya)
================================================================================
==============================================================================
好吧,这是改进版.
由于 VSRI 指令除目标寄存器外不允许再使用两个操作数,因此无法写出在寄存器分配方面更稳健的版本.
Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.
请检查源图像的图像格式.(元素的确切字节顺序)
Please check the image format of your source image. (exact byte order of the elements)
如果不是 B、G、R、A(iOS 上默认的和原生的),您的应用程序将受到 iOS 内部转换的严重影响.
If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.
如果无论出于何种原因绝对不可能更改此设置,请告诉我.我会写一个与之匹配的新版本.
If it's absolutely not possible to change this for whatever the reason, let me know. I'll write a new version matching it.
PS:我忘记去掉函数原型开头的下划线了.现在没了.
PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.
/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */

//-----------------------------------------------------------------------
// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// Converts `count` 32-bit pixels stored in memory as B, G, R, A bytes into
// 16-bit RGB565 pixels (R in bits 15:11, G in bits 10:5, B in bits 4:0,
// stored low byte first).
//
// Syntax: GNU/Apple `as`, ARMv7 + NEON, Mach-O symbol naming (leading `_`).
//         Assumes ARM (not Thumb) encoding: conditional instructions are
//         used without IT blocks.
// In:     r0 = pDst  (receives 2 bytes per pixel)
//         r1 = pSrc  (4 bytes per pixel)
//         r2 = count (number of pixels; must be a multiple of 8)
// Out:    r0 = 0 if count is not a multiple of 8 (nothing is written);
//         otherwise pDst advanced past the last pixel written
//         (pDst itself when count == 0).
// Clobb:  r0-r2, d16-d19, flags
//-----------------------------------------------------------------------
        .align  2
        .globl  _bgra2rgb565_neon
        .private_extern _bgra2rgb565_neon

// ARM
pDst    .req    r0
pSrc    .req    r1
count   .req    r2

// NEON: vld4.8 de-interleaves source bytes 0..3 of 8 pixels into these
blu     .req    d16
grn     .req    d17
red     .req    d18
alp     .req    d19

// Output bytes are built in place: gb (low byte) in grn, rg (high byte) in red.
// They must stay in consecutive D registers for the vst2.8 below.
gb      .req    grn
rg      .req    red

_bgra2rgb565_neon:
        pld     [pSrc]
        tst     count, #0x7
        movne   r0, #0                          // not a multiple of 8: report failure
        bxne    lr
        cmp     count, #0
        bxeq    lr                              // nothing to do; previously the loop still ran
                                                // once and touched 32 src / 16 dst bytes

// Invariant: count > 0 and count % 8 == 0 on every entry to the loop.
L_bgra2rgb565_loop:
        pld     [pSrc, #32]
        vld4.8  {blu, grn, red, alp}, [pSrc]!   // 8 pixels, 32 bytes
        subs    count, count, #8
        vsri.8  red, grn, #5                    // rg = RRRRRGGG : (R & 0xF8) | (G >> 5)
        vshl.i8 gb, grn, #3                     // gb = G << 3   : G[4:2] now in bits 7:5
        vsri.8  gb, blu, #3                     // gb = GGGBBBBB : keep bits 7:5, insert B >> 3
        vst2.8  {gb, rg}, [pDst]!               // interleave low/high bytes: 8 pixels, 16 bytes
        bgt     L_bgra2rgb565_loop
        bx      lr
这篇关于使用 NEON 优化 RGBA8888 到 RGB565 的转换的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!