使用 NEON 优化 RGBA8888 到 RGB565 的转换 [英] Optimizing RGBA8888 to RGB565 conversion with NEON

查看:61
本文介绍了使用 NEON 优化 RGBA8888 到 RGB565 的转换的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试使用 NEON 矢量指令集优化 iOS 上的图像格式转换.我原以为这很适合用 NEON 实现,因为它要处理大量相似的数据.

I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.

不过,我的尝试并没有那么顺利,与简单的 c 实现相比,仅实现了微不足道的加速:

My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:

// Scalar reference conversion: each 32-bit input pixel holds R in bits 0-7,
// G in bits 8-15 and B in bits 16-23 (bits 24-31 are ignored). The top
// 5/6/5 bits of R/G/B are packed into one 16-bit RGB565 value.
for (int i = 0; i < pixelCount; ++i) {
    const unsigned int px = *inPixel32++;
    const unsigned int r5 = (px & 0xFF) >> 3;           // top 5 bits of red
    const unsigned int g6 = ((px >> 8) & 0xFF) >> 2;    // top 6 bits of green
    const unsigned int b5 = ((px >> 16) & 0xFF) >> 3;   // top 5 bits of blue
    *outPixel16++ = (r5 << 11) | (g6 << 5) | b5;
}

iPad 2 上的 1 百万像素图像阵列:

1 megapixel image array on iPad 2:

格式为 [min avg max n=timer samples] 以毫秒为单位

format is [min avg max n=number of timer samples] in milliseconds

C:[14.446 14.632 18.405 n=1000]ms

C: [14.446 14.632 18.405 n=1000]ms

NEON:[11.920 12.032 15.336 n=1000]ms

NEON: [11.920 12.032 15.336 n=1000]ms

我对 NEON 实现的尝试如下:

My attempt at a NEON implementation is below:

    int i;  // declared outside the loop so the remainder code can resume from i
const int pixelsPerLoop = 8;
// Convert 8 pixels per iteration. Only FULL groups of 8 are processed here:
// with `i < pixelCount` as the bound, a final partial group would be loaded and
// stored as 8 whole pixels, overrunning both buffers whenever pixelCount is not
// a multiple of 8 (and leaving nothing for the remainder code below).
for(i = 0; i + pixelsPerLoop <= pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    //De-interleave 8 pixels into 4 registers: val[0]=r, val[1]=g, val[2]=b, val[3]=a (unused)
    uint8x8x4_t rgba  = vld4_u8((const uint8_t *)inPixel32);
    //Right-shift r,g,b down to their 5/6/5-bit widths
    uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    //Widen b; it already sits in bits 4-0 of the 16-bit result
    uint16x8_t r5_g6_b5 = vmovl_u8(b);
    //Widen r
    uint16x8_t r16 = vmovl_u8(r);
    //Left shift into position within 16-bit int (bits 15-11)
    r16 = vshlq_n_u16(r16, 11);
    r5_g6_b5 |= r16;

    //Widen g
    uint16x8_t g16 = vmovl_u8(g);
    //Left shift into position within 16-bit int (bits 10-5)
    g16 = vshlq_n_u16(g16, 5);

    r5_g6_b5 |= g16;

    //Now write the 8 packed pixels back to memory
    vst1q_u16(outPixel16, r5_g6_b5);
}
//Convert the remaining (pixelCount - i) pixels, fewer than 8, with the scalar loop

代码通过 LLVM 3.0 编译成以下内容(.loc 和额外标签已删除):

Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):

_DNConvert_ARGB8888toRGB565:
    // Compiler-generated Thumb-2 code: argument checks, a NEON loop that packs
    // 8 pixels per iteration, and a scalar one-pixel-at-a-time loop.
    // In:  r0 = pointer the loops load 32-bit pixels from
    //      r2 = pointer the loops store 16-bit pixels to
    //      r1 = size; shifted right by 2 to get the pixel count
    //           (presumably the source length in bytes - confirm against the C source)
    // Out: r0 = 0 when an argument check fails, 1 otherwise
    push    {r4, r5, r7, lr}
    mov r9, r1                  // r9 = size argument
    mov.w   r12, #0             // r12 = return value, 0 until the checks pass
    add r7, sp, #8              // r7 = address of the saved r7/lr pair (frame pointer)
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1              // r1 = (r2 != 0)
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1              // r3 = (r0 != 0)
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1              // r4 = (size != 0)
    tst.w   r9, #3
    bne LBB0_8                  // size is not a multiple of 4: return 0
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8                  // r0, r2 or size is zero: return 0
    movs    r1, #0
    lsr.w   lr, r9, #2          // lr = pixel count = size >> 2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9                  // count != 0: set up and run the NEON loop
    mov r3, r2                  // count == 0: scalar pointers are the buffer starts
    mov r5, r0
    b   LBB0_5
LBB0_4:
    // Reached after the NEON loop. r1 = (count + 7) & 0x7FFFFFF8, i.e. the count
    // rounded UP to a multiple of 8 = number of pixels the NEON loop covered.
    // Because of the rounding up, r1 >= count at the compare below, so the
    // scalar loop is skipped on this path.
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    mov.w   r12, #1             // checks passed: return value becomes 1
    cmp r1, lr
    bhs LBB0_8                  // nothing left once r1 >= count (unsigned)
    rsb r0, r1, r9, lsr #2      // r0 = count - r1 = pixels left for the scalar loop
    mov.w   r9, #63488          // 0xF800: mask for the field in bits 15-11
    mov.w   lr, #2016           // 0x07E0: mask for the field in bits 10-5
    mov.w   r12, #1
LBB0_7:
    // Scalar loop: one pixel per iteration, r0 counts down to zero.
    ldr r2, [r5], #4            // load a 32-bit pixel, advance source by 4 bytes
    subs    r0, #1              // sets Z for the bne at the bottom of the loop
    and.w   r1, r9, r2, lsl #8  // pixel bits 7-3   -> result bits 15-11
    and.w   r4, lr, r2, lsr #5  // pixel bits 15-10 -> result bits 10-5
    ubfx    r2, r2, #19, #5     // pixel bits 23-19 -> result bits 4-0
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2        // store the 16-bit pixel, advance destination by 2 bytes
    bne LBB0_7
LBB0_8:
    mov r0, r12                 // return 0 or 1
    pop {r4, r5, r7, pc}
LBB0_9:
    // Precompute the scalar loop's start pointers: the addresses just past the
    // data the NEON loop will process (whole groups of 8 pixels, rounded up).
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31             // r3 = ((count - 1) * 4 + 32) & ~31 source bytes
    adds    r5, r0, r3          // r5 = scalar-loop source pointer
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15             // r1 = ((count - 1) * 2 + 16) & ~15 destination bytes
    adds    r3, r2, r1          // r3 = scalar-loop destination pointer
    movs    r1, #0              // r1 = pixels processed so far
LBB0_10:
    // NEON loop: 8 pixels per iteration, repeated while r1 < count (unsigned).
    vld4.8  {d16, d17, d18, d19}, [r0]!     // de-interleave 32 bytes: d16..d19 = bytes 0..3 of each pixel
    adds    r1, #8
    cmp r1, lr                  // flags are tested by the blo after the NEON instructions
    vshr.u8 d20, d16, #3        // byte 0 of each pixel -> 5-bit value
    vshr.u8 d21, d17, #2        // byte 1 -> 6-bit value
    vshr.u8 d16, d18, #3        // byte 2 -> 5-bit value
    vmovl.u8    q11, d20        // widen the three fields to 16-bit lanes
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    vshl.i16    q10, q11, #11   // byte-0 field into bits 15-11
    vshl.i16    q9, q9, #5      // byte-1 field into bits 10-5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    vst1.16 {d16, d17}, [r2]!   // store 8 packed 16-bit pixels, advance destination
Ltmp28:
    blo LBB0_10
    b   LBB0_4

完整代码可在 https://github.com/darknoon/DNImageConvert 获得.如能提供任何帮助,我将不胜感激,谢谢!

The full code is available at https://github.com/darknoon/DNImageConvert — I would appreciate any help, thanks!

推荐答案

这里是为 XCode 准备的手动优化的 NEON 实现:

Here you are, hand-optimized NEON implementation ready for XCode :

/* IT DOESN'T WORK!!! USE THE NEXT VERSION BELOW.
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// First attempt, flagged by its author as NOT working. It tries to merge the
// colour bit-fields with VEXT, but vext.8 moves whole bytes between registers,
// so the stored bytes are not valid RGB565 pixels.
//
// In:  r0 = pDst, r1 = pSrc, r2 = count (pixels)
// Out: r0 = 0 if count is not a multiple of 8, otherwise pDst as advanced by
//      the post-incrementing stores


//ARM core register aliases (the three arguments)
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON register aliases: the four byte planes produced by vld4.8
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red     // intended high output byte; shares d18 with red
gb          .req    blu     // intended low output byte; shares d16 with blu

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr                              // count not a multiple of 8: return 0

loop:
    pld     [pSrc, #32]                     // prefetch hint for the next 32 source bytes
    vld4.8  {blu, grn, red, alp}, [pSrc]!   // de-interleave 8 pixels, advance pSrc by 32
    subs    count, count, #8                // sets the flags tested by bgt below
    vshr.u8 red, red, #3
    vext.8  rg, grn, red, #5                // bytes 5-7 of grn followed by bytes 0-4 of red:
                                            // a byte shuffle across pixels, not a bit merge
    vshr.u8 grn, grn, #2
    vext.8  gb, blu, grn, #3                // bytes 3-7 of blu followed by bytes 0-2 of grn
    vst2.8  {gb, rg}, [pDst]!               // interleave gb/rg bytes, advance pDst by 16
    bgt     loop                            // signed test: loop while count > 0

    bx      lr

这个版本会比你建议的快很多倍:

This version will be many times faster than what you suggested :

  • 通过 PLD 提高缓存命中率

  • increased cache hit rate via PLD

不需要转换为"long"

conversion to "long" not necessary

循环中的指令更少

尽管仍有一些优化空间,您可以修改循环,使其每次迭代转换 16 个像素而不是 8 个.然后,您可以安排指令以完全避免两个停顿(这在上面的这个 8/迭代版本中根本不可能)并另外受益于 NEON 的双重发布功能.

There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8. Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.

我没有这样做,因为它会使代码难以理解.

I didn't do this because it would make the code hard to understand.

了解 VEXT 应该做什么很重要.

It's important to know what VEXT is supposed to do.

现在由你决定.:)

我验证了这段代码可以在 Xcode 下正确编译.虽然我很确定它也能正常工作,但我不能保证这一点,因为我没有测试环境.如果出现故障,请告诉我.届时我会相应地更正.

I verified this code to be properly compiled under Xcode. Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment. In case of malfunctioning, please let me know. I'll correct it accordingly then.

回头见

================================================================================

==============================================================================

好吧,这是改进版.

由于 VSRI 指令的性质不允许除目标之外的两个操作数,因此无法写出在寄存器分配方面更稳健的版本.

Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.

请检查源图像的图像格式.(元素的确切字节顺序)

Please check the image format of your source image. (exact byte order of the elements)

如果不是 B、G、R、A(iOS 上默认的和原生的),您的应用程序将受到 iOS 内部转换的严重影响.

If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.

如果无论出于何种原因绝对不可能更改此设置,请告诉我.我会写一个与之匹配的新版本.

If it's absolutely not possible to change this for whatever the reason, let me know. I'll write a new version matching it.

PS:我忘记去掉函数原型开头的下划线了.现在没了.

PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// Converts `count` pixels stored as B, G, R, A bytes into 16-bit RGB565
// (low byte stored first), 8 pixels per loop iteration.
//
// In:    r0 = pDst   destination, 2 bytes written per pixel
//        r1 = pSrc   source, 4 bytes read per pixel
//        r2 = count  number of pixels; must be a multiple of 8
// Out:   r0 = 0 if count is not a multiple of 8 (nothing is converted),
//             otherwise pDst advanced past the last pixel written
//             (the incoming pDst unchanged when count == 0)
// Clobb: r1, r2, d16-d19, flags


//ARM core register aliases (the three arguments)
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON register aliases: the four byte planes produced by vld4.8
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

// Output bytes are built in place. They must be the consecutive pair d17,d18
// in this order because vsri overwrites its destination and vst2.8 stores
// {gb, rg} as low byte, high byte.
gb          .req    grn
rg          .req    red

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7
    movne   r0, #0
    bxne    lr                              // count not a multiple of 8: return 0
    cmp     count, #0
    bxeq    lr                              // nothing to convert; without this the loop
                                            // body would still run once and touch 8 pixels

.loop:
    pld     [pSrc, #32]                     // prefetch hint for the next 32 source bytes
    vld4.8  {blu, grn, red, alp}, [pSrc]!   // de-interleave 8 pixels, advance pSrc by 32
    subs    count, count, #8                // sets the flags tested by bhi below

    vsri.8  red, grn, #5                    // rg = (R & 0xF8) | (G >> 5)         high byte
    vshl.i8 gb, grn, #3                     // gb = G << 3
    vsri.8  gb, blu, #3                     // gb = ((G << 3) & 0xE0) | (B >> 3)  low byte

    vst2.8  {gb, rg}, [pDst]!               // interleave low/high bytes, advance pDst by 16
    bhi     .loop                           // unsigned: more than 8 pixels remained before the subs

    bx      lr

这篇关于使用 NEON 优化 RGBA8888 到 RGB565 的转换的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆