使用 NEON 优化 RGBA8888 到 RGB565 的转换 [英] Optimizing RGBA8888 to RGB565 conversion with NEON

查看:2297
本文介绍了优化RGBA8888与NEON RGB565转换的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试使用 NEON 向量指令集优化 iOS 上的一个图像格式转换。我原以为这类任务很适合用 NEON,因为它处理的是大量相似的数据。

I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.

不过我的尝试效果并不理想,相比朴素的 C 实现只获得了很小的加速:

My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive C implementation:

// Scalar reference conversion: packs every 32-bit input pixel into one
// 16-bit RGB565 value. Bits 0-7 of the input supply red, bits 8-15 green
// and bits 16-23 blue; bits 24-31 are ignored.
for (int n = 0; n < pixelCount; ++n) {
    const unsigned int px = *inPixel32++;

    // Keep only the most significant 5/6/5 bits of each 8-bit channel.
    const unsigned int red5 = (px >> 3)  & 0x1F;
    const unsigned int grn6 = (px >> 10) & 0x3F;
    const unsigned int blu5 = (px >> 19) & 0x1F;

    // Layout of the result: RRRRRGGG GGGBBBBB
    *outPixel16++ = (red5 << 11) | (grn6 << 5) | blu5;
}

这是iPad 2的100万像素图像阵列:

1 megapixel image array on iPad 2:

格式为 [最小值 平均值 最大值 n=计时器采样次数],单位为毫秒。

format is [min avg max n=number of timer samples] in milliseconds

C:
[14.446 14.632 18.405 N = 1000]毫秒

C: [14.446 14.632 18.405 n=1000]ms

NEON:
[11.920 12.032 15.336 N = 1000]毫秒

NEON: [11.920 12.032 15.336 n=1000]ms

我在NEON实现尝试如下:

My attempt at a NEON implementation is below:

    int i;  // declared outside the loop so the scalar remainder code can resume at i
const int pixelsPerLoop = 8;
// Vectorised conversion: each iteration turns eight 32-bit pixels into eight
// RGB565 values.
//
// Only complete groups of eight are handled here. The loop condition used to be
// `i < pixelCount`, which let the final iteration load 32 bytes and store
// 16 bytes beyond the end of the buffers whenever pixelCount was not a multiple
// of eight, and left nothing for the remainder step to do. With the condition
// below, i stops at the last multiple of eight that fits in pixelCount.
for (i = 0; i + pixelsPerLoop <= pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
    // De-interleave: val[0] = the eight r bytes, val[1] = g, val[2] = b, val[3] = a.
    // vld4_u8 takes a byte pointer, hence the explicit cast.
    const uint8x8x4_t rgba = vld4_u8((const uint8_t *)inPixel32);

    // Drop the low bits of each channel (5 bits r, 6 bits g, 5 bits b remain).
    const uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
    const uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
    const uint8x8_t b = vshr_n_u8(rgba.val[2], 3);

    // Widen to 16 bits and move r and g to their positions; b already sits in bits 0-4.
    const uint16x8_t r16 = vshlq_n_u16(vmovl_u8(r), 11);
    const uint16x8_t g16 = vshlq_n_u16(vmovl_u8(g), 5);
    const uint16x8_t b16 = vmovl_u8(b);

    // Combine the three fields and write the eight results back to memory.
    const uint16x8_t r5_g6_b5 = vorrq_u16(vorrq_u16(b16, r16), g16);
    vst1q_u16(outPixel16, r5_g6_b5);
}
// Do the remainder (pixels i .. pixelCount-1, fewer than eight) with plain scalar code

代码经 LLVM 3.0 编译后得到如下汇编(已去掉 .loc 和多余的标签):

Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):

// Compiler-generated Thumb-2 code (note the IT blocks and .w encodings) for the
// conversion routine. What this listing itself shows:
//   r0  source pointer      - read by the NEON loop; r5 (scalar-loop source) is derived from it
//   r1  length              - must be a non-zero multiple of 4; r1 >> 2 is used as the pixel
//                             count, so this is presumably a byte length (TODO confirm)
//   r2  destination pointer - written by the NEON loop; r3 (scalar-loop dest) is derived from it
//   result (built in r12, returned in r0): 0 if an argument is zero or r1 is not a
//   multiple of 4, otherwise 1
_DNConvert_ARGB8888toRGB565:
    push    {r4, r5, r7, lr}
    // r9 = length argument, r12 = return value (0 until validation passes)
    mov r9, r1
    mov.w   r12, #0
    add r7, sp, #8
    // r1 = (r2 != 0)
    cmp r2, #0
    mov.w   r1, #0
    it  ne
    movne   r1, #1
    // r3 = (r0 != 0)
    cmp r0, #0
    mov.w   r3, #0
    it  ne
    movne   r3, #1
    // r4 = (length != 0)
    cmp.w   r9, #0
    mov.w   r4, #0
    it  ne
    movne   r4, #1
    // Reject a length that is not a multiple of 4 (return 0)
    tst.w   r9, #3
    bne LBB0_8
    // Reject if any of the three arguments was zero (return 0)
    ands    r1, r3
    ands    r1, r4
    cmp r1, #1
    bne LBB0_8
    // lr = length >> 2 = pixel count; a non-zero count goes to the NEON path
    movs    r1, #0
    lsr.w   lr, r9, #2
    cmp.w   r1, r9, lsr #2
    bne LBB0_9
    // Zero-pixel path: scalar pointers start at the original dst/src, r1 = 0 pixels done
    mov r3, r2
    mov r5, r0
    b   LBB0_5
LBB0_4:
    // After the NEON loop: r1 = (pixel count + 7) & 0x7FFFFFF8,
    // i.e. the pixel count rounded up to a multiple of 8
    movw    r1, #65528
    add.w   r0, lr, #7
    movt    r1, #32767
    ands    r1, r0
LBB0_5:
    // Validation passed, so the result is 1. Return now if r1 (pixels already
    // handled) >= pixel count.
    // NOTE(review): coming from LBB0_4, r1 is the count rounded up, so it is never
    // below lr and the scalar loop below does not appear to be reachable in practice.
    mov.w   r12, #1
    cmp r1, lr
    bhs LBB0_8
    // r0 = remaining pixels; r9 = 0xF800 and lr = 0x07E0 are the field masks used below
    rsb r0, r1, r9, lsr #2
    mov.w   r9, #63488
    mov.w   lr, #2016
    mov.w   r12, #1
LBB0_7:
    // Scalar loop: one 32-bit pixel in (r5, post-incremented by 4),
    // one 16-bit pixel out (r3, post-incremented by 2)
    ldr r2, [r5], #4
    subs    r0, #1
    // r1 = (px << 8) & 0xF800 : input bits 3-7 land in output bits 11-15
    and.w   r1, r9, r2, lsl #8
    // r4 = (px >> 5) & 0x07E0 : input bits 10-15 land in output bits 5-10
    and.w   r4, lr, r2, lsr #5
    // r2 = input bits 19-23, placed in output bits 0-4
    ubfx    r2, r2, #19, #5
    orr.w   r2, r2, r4
    orr.w   r1, r1, r2
    strh    r1, [r3], #2
    // Flags still come from the subs above: loop until the remaining count hits 0
    bne LBB0_7
LBB0_8:
    // Common exit: return the value accumulated in r12
    mov r0, r12
    pop {r4, r5, r7, pc}
LBB0_9:
    // NEON path setup.
    // r5 = r0 + ((32 + (count-1)*4) & ~31): source address just past the last
    // 8-pixel group the NEON loop will read
    sub.w   r1, lr, #1
    movs    r3, #32
    add.w   r3, r3, r1, lsl #2
    bic r3, r3, #31
    adds    r5, r0, r3
    // r3 = r2 + ((16 + (count-1)*2) & ~15): the matching destination address
    movs    r3, #16
    add.w   r1, r3, r1, lsl #1
    bic r1, r1, #15
    adds    r3, r2, r1
    // r1 = number of pixels processed so far
    movs    r1, #0
LBB0_10:
    // Load 8 pixels (32 bytes) de-interleaved: d16/d17/d18/d19 receive bytes
    // 0/1/2/3 of each pixel; r0 advances by writeback
    vld4.8  {d16, d17, d18, d19}, [r0]!
    // Count 8 more pixels and compare with the total; the NEON instructions in
    // between leave the flags alone, so the blo at the bottom tests this compare
    adds    r1, #8
    cmp r1, lr
    // Reduce the channels to 5, 6 and 5 significant bits
    vshr.u8 d20, d16, #3
    vshr.u8 d21, d17, #2
    vshr.u8 d16, d18, #3
    // Widen each channel to 16-bit lanes
    vmovl.u8    q11, d20
    vmovl.u8    q9, d21
    vmovl.u8    q8, d16
    // Shift the byte-0 channel to bits 11-15 and the byte-1 channel to bits 5-10,
    // then OR all three fields together in q8
    vshl.i16    q10, q11, #11
    vshl.i16    q9, q9, #5
    vorr    q8, q8, q10
    vorr    q8, q8, q9
    // Store 8 packed 16-bit pixels; r2 advances by writeback
    vst1.16 {d16, d17}, [r2]!
Ltmp28:
    // Repeat while processed < count (unsigned). The final pass always handles a
    // full group of 8, even when fewer pixels remain.
    blo LBB0_10
    b   LBB0_4

完整代码见 https://github.com/darknoon/DNImageConvert 。如能提供任何帮助,我将不胜感激,谢谢!

Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!

推荐答案

给你,这是可直接在 Xcode 中使用的手工优化 NEON 实现:

Here you are, hand-optimized NEON implementation ready for XCode :

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * NOTE: this listing was originally published with the warning
 * "IT DOESN'T WORK" because it merged the channels with VEXT.8. VEXT moves
 * whole bytes between two vectors, whereas packing RGB565 needs bit fields
 * inserted inside each byte. The merge below uses VSRI/VSLI (shift and
 * insert) instead, and a zero pixel count no longer touches memory.
 */


    .text
    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// Converts `count` pixels stored as bytes B, G, R, A into 16-bit RGB565.
//   In:   r0 = pDst, r1 = pSrc, r2 = count (must be a multiple of 8)
//   Out:  r0 = 0 if count is not a multiple of 8 (nothing is written);
//         otherwise pDst advanced past the last pixel written
//         (unchanged when count is 0)
//   Uses: r1, r2, d16-d19, flags


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19
rg          .req    red         // high output byte: R[7:3]:G[7:5], built in place
gb          .req    blu         // low output byte:  G[4:2]:B[7:3], built in place

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7         // only whole groups of 8 pixels are supported
    movne   r0, #0
    bxne    lr
    cmp     count, #0           // nothing to do: leave both buffers untouched
    bxeq    lr

1:
    pld     [pSrc, #32]         // prefetch the next group of 8 pixels
    vld4.8  {blu, grn, red, alp}, [pSrc]!   // de-interleave 8 pixels into B, G, R, A
    subs    count, count, #8    // flags are consumed by the bhi at the bottom
    vsri.8  rg, grn, #5         // rg = (red & 0xF8) | (grn >> 5)
    vshr.u8 grn, grn, #2        // grn = 00:G[7:2]
    vshr.u8 gb, blu, #3         // gb  = 000:B[7:3]
    vsli.8  gb, grn, #5         // gb  = G[4:2]:B[7:3]
    vst2.8  {gb, rg}, [pDst]!   // interleave low/high bytes: 8 little-endian RGB565 pixels
    bhi     1b                  // unsigned: loop while pixels remain

    bx      lr

这个版本会比你的建议快许多倍:

This version will be many times faster than what you suggested :


  • 通过PLD提高缓存命中率

  • increased cache hit rate via PLD

  • 无需转换为 long

  • no conversion to long necessary

  • 循环中的指令更少

  • fewer instructions within the loop

不过仍有一些优化空间:你可以修改循环,使其每次迭代转换 16 个像素而不是 8 个。
这样就可以重新安排指令顺序,完全避免那两处流水线停顿(在上面每次迭代 8 个像素的版本中根本做不到),此外还能受益于 NEON 的双发射能力。

There is still some room for optimizations though, you could modify the loop so that it converts 16 pixels per iteration instead of 8. Then you can schedule the instructions to avoid the two stalls completely (which is simply not possible in this 8/iteration version above) and benefit from NEON's dual-issue capability in addition.

我没有这样做,因为那会使代码变得难以理解。

I didn't do this because it would make the code hard to understand.

要知道什么VEXT是应该做的是很重要的。

It's important to know what VEXT is supposed to do.

现在就看你了。 :)

我已验证这段代码可以在 Xcode 下正确编译。
虽然我相当确定它也能正常工作,但我无法保证这一点,因为我没有测试环境。
如果出现问题,请告诉我,我会相应地修正。

I verified this code to be properly compiled under Xcode. Although I'm pretty sure it works correctly as well, I cannot guarantee this since I don't have the test environment. In case of malfunctioning, please let me know. I'll correct it accordingly then.

CYA

==============================================================================

==============================================================================

好了,这里是改进版。

由于 VSRI 指令除目标寄存器外不允许有两个操作数,因此在寄存器分配方面无法写出更健壮的版本。

Due to the nature of the VSRI instruction not allowing two operands other than the target, it was not possible to create a more robust one regarding the register assignment.

请检查您的源图像的图像格式。 (元素的确切字节顺序)

Please check the image format of your source image. (exact byte order of the elements)

如果它不是 B、G、R、A(这是 iOS 上默认的原生格式),你的应用将因 iOS 内部的格式转换而严重影响性能。

If it's not B, G, R, A, which is the default and native one on iOS, your application will suffer heavily from internal conversions by iOS.

如果这是绝对不可能改变这种不管什么原因,让我知道。
我会写一个新的版本匹配了。

If it's absolutely not possible to change this for whatever the reason, let me know. I'll write a new version matching it.

PS:我忘了在函数原型的开始删除下划线。现在它不见了。

PS : I forgot to remove the underscore at the start of the function prototype. Now it's gone.

/*
 * BGRA2RGB565.s
 *
 * Created by Jake "Alquimista" Lee on 11. 11. 1..
 * Copyright 2011 Jake Lee. All rights reserved.
 *
 * Version 1.2
 * - a pixel count of 0 returns immediately instead of converting 8 pixels
 * - loop uses an assembler-local label and an unsigned loop test
 *
 * Version 1.1
 * - bug fix
 *
 * Version 1.0
 * - initial release
 */


    .text
    .align 2
    .globl _bgra2rgb565_neon
    .private_extern _bgra2rgb565_neon

// unsigned int * bgra2rgb565_neon(unsigned int * pDst, unsigned int * pSrc, unsigned int count);
//
// Converts `count` pixels stored as bytes B, G, R, A into 16-bit RGB565.
//   In:   r0 = pDst, r1 = pSrc, r2 = count (must be a multiple of 8)
//   Out:  r0 = 0 if count is not a multiple of 8 (nothing is written);
//         otherwise pDst advanced past the last pixel written
//         (unchanged when count is 0)
//   Uses: r1, r2, d16-d19, flags


//ARM
pDst        .req    r0
pSrc        .req    r1
count       .req    r2

//NEON
blu         .req    d16
grn         .req    d17
red         .req    d18
alp         .req    d19

// VSRI inserts into its destination, so both output bytes are built in place
// in the green and red registers. They are adjacent (d17, d18), which is what
// the VST2 register list below relies on.
gb          .req    grn         // low output byte:  G[4:2]:B[7:3]
rg          .req    red         // high output byte: R[7:3]:G[7:5]

_bgra2rgb565_neon:
    pld     [pSrc]
    tst     count, #0x7         // only whole groups of 8 pixels are supported
    movne   r0, #0
    bxne    lr
    cmp     count, #0           // nothing to do: leave both buffers untouched
    bxeq    lr

1:
    pld     [pSrc, #32]         // prefetch the next group of 8 pixels
    vld4.8  {blu, grn, red, alp}, [pSrc]!   // de-interleave 8 pixels into B, G, R, A
    subs    count, count, #8    // flags are consumed by the bhi at the bottom

    vsri.8  rg, grn, #5         // rg = (red & 0xF8) | (grn >> 5); must read grn before it is shifted
    vshl.i8 gb, grn, #3         // gb = grn << 3, leaving G[4:2] in bits 7-5
    vsri.8  gb, blu, #3         // gb = (gb & 0xE0) | (blu >> 3)

    vst2.8  {gb, rg}, [pDst]!   // interleave low/high bytes: 8 little-endian RGB565 pixels
    bhi     1b                  // unsigned: loop while pixels remain

    bx      lr

这篇关于优化RGBA8888与NEON RGB565转换的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆