将GCC汇编代码转换为armasm汇编代码 [英] Converting GCC assembly code to armasm assembly code

查看:388
本文介绍了转换GCC汇编code与armasm汇编code的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想把 GCC 汇编代码转换为 ARMASM 汇编代码,谁能帮帮我?主要的问题是 .req、.unreq、.qn、.dn 这几个伪指令,我想知道它们在 ARMASM 中的等价写法。我试过 ALIAS,但不起作用。

  4 .align伪
。全球ne10_fir_float_neon
.extern ne10_qMaskTable32
。拇指
.thumb_funcne10_fir_float_neon:
PUSH {R4-R12,LR} @push R12:
保持栈对齐的8字节
@ / * ARM寄存器* /
pStateStruct名.req R0
PSRC名.req R1
pDst名.req R2
BLOCKSIZE名.req R3pState名.req R4 @ / *州指针* /
pCoeffs名.req R5 @ / *系数指针* /
pStateCurnt名.req R6 @ / *点的状态的电流采样* /对于态缓冲器*的pX名.req R7 @ / *临时指针/
PB名.req R8 @ / *为系数缓冲区*临时指针/
numTaps名.req R9 @ / *过滤器*长度/tapCnt名.req R10 @ / *循环计数器* /
数名.req R11 @ / *循环计数器* /
PTEMP名.req R11
PMASK名.req R14 @ / *面膜表* /面膜名.req R12@ / * NEON variale声明* /
qInp .qn Q0.F32
dInp_0 .DN D0.F32
dInp_1 .DN D1.F32
qCoeff .qn Q1.F32
dCoeff_0 .DN D2.F32
dCoeff_1 .DN D3.F32
qZero .qn Q2.F32qMask .qn Q3.U32
dMask_0 .DN D6.U32
dMask_1 .DN D7.U32
dOut_0 .DN D6.F32
dOut_1 .DN D7.F32qAcc0 .qn Q8.F32
dAcc0_0 .DN D16.F32
dAcc0_1 .DN D17.F32
QTEMP .qn Q9.F32
dTemp_0 .DN D18.F32
dTemp_1 .DN D19.F32qTemp1 .qn Q10.F32
dTemp1_0 .DN D20.F32
dTemp1_1 .DN D21.F32
qTemp2 .qn Q11.F32
qTemp3 .qn Q12.F32
qMask1 .qn Q13.U32
dMask1_0 .DN D26.U32
dMask1_1 .DN D27.U32
qMaskTmp .qn Q14.U32
dMaskTmp_0 .DN D28.U32
dMaskTmp_1 .DN D29.U32qAcc1 .qn Q3.F32
qAcc2 .qn Q13.F32
qAcc3 .qn Q15.F32
LDRH numTaps,[pStateStruct],#4
LDR pState,[pStateStruct],#4
LDR pCoeffs,[pStateStruct],#4@ / * S-GT&;态缓冲器包含previous框架(numTaps - 1)样品* /
@ / * pStateCurnt指向其中新的输入数据应写入的位置* /
@ / * pStateCurnt =及(S->状态[(numTaps - 1U)])@ * /
子网掩码,numTaps,#1
LDR PMASK,= ne10_qMaskTable32
                   和tapCnt,numTaps,#3
                   ADD pStateCurnt,pState,面具,LSL#2
                   和口罩,块大小,#3
                   @ / *应用循环展开,同时计算4输出值。
                    @ *变量ACC0 ......正在计算ACC3保持输出值:
                    @ *
                    @ * ACC0 = B [numTaps-1] * X [正numTaps-1] + B [numTaps-2] * X [正numTaps-2] + B [numTaps-3] * X [正numTaps -3- ] + ... + b [0] * X [0]
                    @ * ACC1 = B [numTaps-1] * X [正numTaps] + B [numTaps-2] * X [正numTaps-1] + B [numTaps-3] * X [正numTaps-2] + ... + b [0] * X [1]
                    @ * ACC2 = B [numTaps-1] * X [正numTaps + 1〕+ B [numTaps-2] * X [正numTaps] + B [numTaps-3] * X [正numTaps-1] + ... + b [0] * X [2]
                    @ * ACC3 = B [numTaps-1] * X [正numTaps + 2] + B [numTaps-2] * X [正numTaps + 1〕+ B [numTaps-3] * X [正numTaps] + ... + b [0] * X [3]
                    @ * /                   @ / *如果numTaps,块大小不是4的倍数,获取适当的面具* /
                   ADD PTEMP,PMASK,tapCnt,LSL#4
                   VEOR qZero,qZero
                   ADD像素,PMASK,面具,LSL#4
                   VLD1 {dMaskTmp_0,dMaskTmp_1},[PTEMP]
                   VLD1 {dMask1_0,dMask1_1},[的pX]
                   @ / *复制blockCnt一些新的输入样本的进入状态缓冲区* /                   SUBS BLOCKSIZE,#4
                   BLT firEndOuterLoop                   @ / *计算4一次输出* /                   firOuterLoop:                   VLD1 {dTemp_0,dTemp_1},[PSRC]!
                   MOV像素,pState
                   MOV PB,pCoeffs
                   @ / *读取从态缓冲器的前四个样本:
                    @ * X [正numTaps]中,x [正numTaps-1],X [正numTaps-2]中,x [正numTaps-3] * /                   VST1 {dTemp_0,dTemp_1},[pStateCurnt]!
                   @ / *零累加器* /
                   VEOR qAcc0,qAcc0
                   VLD1 {dInp_0,dInp_1},[的pX]!                   // @ *阅读前四系数B〔numTaps]到b [numTaps-3] * /
                   VLD1 {dCoeff_0,dCoeff_1},[PB]!
                   @ / *循环展开。处理4抽头一次。 * /
                   SUBS tapCnt,numTaps,#4
                   VLD1 {dTemp_0,dTemp_1},[的pX]!                   BLT firEndInnerLoop                   firInnerLoop:
                   VEXT qTemp1,qInp,QTEMP,#1
                   @ / * ACC0 + = B [numTaps] * X [正numTaps-1] + B [numTaps] * X [正numTaps-2] +
                    @ * B [numTaps] * X [正numTaps-3] + B [numTaps] * X [正numTaps-4] * /
                   VMLA qAcc0,qInp,dCoeff_0 [0]
                   @ / * ACC1 + = B [numTaps-1] * X [正numTaps-2] + B [numTaps-1] * X [正numTaps-3] +
                    @b [numTaps-1] * X [正numTaps-4] + * B [numTaps-1] * X [正numTaps-5] * /
                   VMUL qAcc1,qTemp1,dCoeff_0 [1]                   VEXT qTemp2,qInp,QTEMP,#2
                   @ / * ACC2 + = B [numTaps-2] * X [正numTaps-3] + B [numTaps-2] * X [正numTaps-4] +
                    @b [numTaps-2] * X [正numTaps-5] + * B [numTaps-2] * X [正numTaps-6] * /
                   VMUL qAcc2,qTemp2,dCoeff_1 [0]
                   VADD qAcc0,qAcc0,qAcc1                   VEXT qTemp3,qInp,QTEMP,#3
                   @ / * ACC3 + = B [numTaps-3] * X [正numTaps-4] + B [numTaps-3] * X [正numTaps-5] +
                    @b [numTaps-3] * X [正numTaps-6] + * B [numTaps-3] * X [正numTaps-7] * /
                   VMUL qAcc3,qTemp3,dCoeff_1 [1]
                   VADD qAcc0,qAcc0,qAcc2                   VMOV qInp,QTEMP
                   VLD1 {dTemp_0,dTemp_1},[的pX]!
                   VADD qAcc0,qAcc0,qAcc3                   SUBS tapCnt,#4
                   @ / *读取B〔numTaps-4] B〔numTaps-7]系数* /
                   VLD1 {dCoeff_0,dCoeff_1},[PB]!
                   BGE firInnerLoop
                   firEndInnerLoop:                   ADDS tapCnt,tapCnt,#4
                   BEQ firStoreOutput                   @ / *如果滤波器长度不是4的倍数,计算剩余的滤波器抽头* /
                   @ / *选择仅剩下滤波器抽头* /
                   VMOV qMask,qMaskTmp
                   VBSL qMask,qCoeff,qZero
                   VEXT qTemp1,qInp,QTEMP,#1
                   VMLA qAcc0,qInp,dOut_0 [0]
                   VEXT qTemp2,qInp,QTEMP,#2
                   VMLA qAcc0,qTemp1,dOut_0 [1]
                   VMLA qAcc0,qTemp2,dOut_1 [0]                   firStoreOutput:
                   @ / * 4推进状态的指针来处理的4个样品下一组* /
                   ADD pState,#16                   @ / *在4累加器的结果是在2.30的格式。转换为1.31
                    @ *然后存储在目的地缓冲的4个输出。 * /
                   SUBS BLOCKSIZE,#4
                   VST1 {dAcc0_0,dAcc0_1},[pDst]!                   BGE firOuterLoop                   firEndOuterLoop:
                   @ / *处理块大小不是4的倍数* /
                   ADDS BLOCKSIZE,#4
                   BEQ firCopyData
                   @ / *复制输入采样的块大小剩余数量态缓冲器* /
                   VMOV qMask,qMask1
                   VLD1 {dTemp1_0,dTemp1_1},[pStateCurnt]
                   VLD1 {dTemp_0,dTemp_1},[PSRC]                   ADD PSRC,PSRC,块大小,LSL#2
                   MOV像素,pState
                   MOV PB,pCoeffs                   VBSL qMask,QTEMP,qTemp1
                   VST1 {dMask_0,dMask_1},[pStateCurnt]
                   VLD1 {dInp_0,dInp_1},[的pX]!                   ADD pStateCurnt,pStateCurnt,块大小,LSL#2                   @ / *零累加器* /
                   VEOR qAcc0,qAcc0
                   VLD1 {dCoeff_0,dCoeff_1},[PB]!
                   SUBS tapCnt,numTaps,#4
                   VLD1 {dTemp_0,dTemp_1},[的pX]!                   BLT firEndInnerLoop1                   firInnerLoop1:                   VEXT qTemp1,qInp,QTEMP,#1
                   VMLA qAcc0,qInp,dCoeff_0 [0]
                   VEXT qTemp2,qInp,QTEMP,#2
                   VMLA qAcc0,qTemp1,dCoeff_0 [1]
                   VEXT qTemp3,qInp,QTEMP,#3
                   VMLA qAcc0,qTemp2,dCoeff_1 [0]
                   VMOV qInp,QTEMP
                   VMLA qAcc0,qTemp3,dCoeff_1 [1]
                   VLD1 {dCoeff_0,dCoeff_1},[PB]!
                   SUBS tapCnt,#4
                   VLD1 {dTemp_0,dTemp_1},[的pX]!                   BGE firInnerLoop1
                   firEndInnerLoop1:
                   VMOV qMask,qMaskTmp
                   VBSL qMask,qCoeff,qZero
                   VEXT qTemp1,qInp,QTEMP,#1
                   VMLA qAcc0,qInp,dOut_0 [0]
                   VEXT qTemp2,qInp,QTEMP,#2
                   VMLA qAcc0,qTemp1,dOut_0 [1]
                   VMLA qAcc0,qTemp2,dOut_1 [0]
                   VMOV qMask,qMask1
                   VLD1 {dTemp_0,dTemp_1},[pDst]
                   @ / *如果块大小不是4的倍数,屏蔽不需要的输出* /                   VBSL qMask,qAcc0,QTEMP
                   VST1 {dMask_0,dMask_1},[pDst]
                   ADD pDst,pDst,块大小,LSL#2
                   ADD pState,pState,块大小,LSL#2
                   firCopyData:
                   @ / *处理就完成了。现在,数据的状态缓冲下降移
                    @ ** BLOCKSIZE样本。这prepares下一个功能态缓冲器
                    @ **电话。 * /                   @ / *点的状态缓冲区的开始* /                   SUB numTaps,numTaps,#1
                   和口罩,numTaps,#3
                   LDR pStateCurnt,[pStateStruct,# - 8]
                   ADD PTEMP,PMASK,面具,LSL#4
                   VLD1 {dInp_0,dInp_1},[pState]!
                   VLD1 {dMask_0,dMask_1},[PTEMP]
                   @ / *复制数据* /                   SUBS计数,numTaps,#4
                   BLT的友情
                   firCopyLoop:
                   VST1 {dInp_0,dInp_1},[pStateCurnt]!
                   SUBS伯爵,#4
                   VLD1 {dInp_0,dInp_1},[pState]!
                   BGE firCopyLoop                   的友情:                   VLD1 {dTemp_0,dTemp_1},[pStateCurnt]
                   VBSL qMask,qInp,QTEMP
                   VST1 {dOut_0,dOut_1},[pStateCurnt]
                   ADD pStateCurnt,pStateCurnt,面具,LSL#2                   @ / *返回从功能* /
                   POP {R4-R12,PC}
                   @ / * ARM寄存器* /
                   .unreq pStateStruct
                   .unreq PSRC
                   .unreq pDst
                   .unreq BLOCKSIZE                   .unreq pState
                   .unreq pCoeffs
                   .unreq pStateCurnt                   .unreq的pX
                   .unreq PB
                   .unreq numTaps                   .unreq tapCnt
                   .unreq计数
                   .unreq PTEMP
                   .unreq PMASK                   .unreq面具                   @ / * NEON variale声明* /
                   .unreq qInp
                   .unreq dInp_0
                   .unreq dInp_1
                   .unreq qCoeff
                   .unreq dCoeff_0
                   .unreq dCoeff_1
                   .unreq qZero                   .unreq qMask
                   .unreq dMask_0
                   .unreq dMask_1
                   .unreq dOut_0
                   .unreq dOut_1                   .unreq qAcc0
                   .unreq dAcc0_0
                   .unreq dAcc0_1                   .unreq QTEMP
                   .unreq dTemp_0
                   .unreq dTemp_1                   .unreq qTemp1
                   .unreq dTemp1_0
                   .unreq dTemp1_1
                   .unreq qTemp2
                   .unreq qTemp3
                   .unreq qMask1
                   .unreq dMask1_0
                   .unreq dMask1_1
                   .unreq qMaskTmp
                   .unreq dMaskTmp_0
                   .unreq dMaskTmp_1                   .unreq qAcc1
                   .unreq qAcc2
                   .unreq qAcc3
                   。结束


解决方案

明白了:.req 与 RN 是等价的。做了一些调整并删除了不必要的伪指令之后,代码就可以正常工作了!

I am trying to convert GCC assembly code to ARMASM assembly code. Can anyone please help me with this? The main problem is the `.req`, `.unreq`, `.qn` and `.dn` directives: I want to know the ARMASM equivalents of these directives. I tried `ALIAS`, but it did not work.

 .align   4                     @ on ARM targets GAS reads the .align operand as a power of two (2^4 = 16 bytes)
.global   ne10_fir_float_neon
.extern   ne10_qMaskTable32     @ mask table, defined elsewhere; indexed in 16-byte entries below
.thumb
.thumb_func

@-----------------------------------------------------------------------
@ ne10_fir_float_neon -- FIR filter on 32-bit float samples (Thumb-2 + NEON)
@ In (per the .req aliases declared right after the prologue):
@   r0 = pStateStruct  filter instance; numTaps, state pointer and
@                      coefficient pointer are loaded from it
@   r1 = pSrc          input samples
@   r2 = pDst          output samples
@   r3 = blockSize     number of samples to process
@ Saves r4-r12 and lr on entry; returns by popping the saved lr into pc.
@-----------------------------------------------------------------------
ne10_fir_float_neon:
@ r12 is pushed together with r4-r11 and lr so that ten words (40 bytes)
@ go on the stack, which keeps the stack pointer 8-byte aligned.
PUSH    {r4-r12,lr}    @push r12: to keep stack 8 bytes aligned
@/*ARM Registers*/
@ Symbolic names for the core registers used by ne10_fir_float_neon.
@ R0-R3 are the four incoming arguments; every alias is released again
@ with .unreq at the end of the function.
pStateStruct     .req   R0             @ filter instance pointer (advanced by the post-indexed loads at function start)
pSrc             .req   R1             @ input sample pointer
pDst             .req   R2             @ output sample pointer
blockSize        .req   R3             @ sample count; biased by -4 while the outer loop runs

pState           .req   R4             @/* State pointer */
pCoeffs          .req   R5             @/* Coefficient pointer */
pStateCurnt      .req   R6             @/* Points to the current sample of the state */

pX               .req   R7             @/* Temporary pointers for state buffer */
pB               .req   R8             @/* Temporary pointers for coefficient buffer */
numTaps          .req   R9             @/* Length of the filter */

tapCnt           .req   R10            @ /* Loop counter */
Count            .req   R11            @ /* Loop counter */
pTemp            .req   R11            @ shares R11 with Count: the two names must never be live together
pMask            .req   R14            @  /* Mask Table */ -- this is LR; the original LR is on the stack

mask             .req   R12            @ scratch index: numTaps-1, then blockSize & 3, later (numTaps-1) & 3

@/*NEON variale Declaration*/
@ Typed NEON aliases. Each Qn is declared next to its two D halves
@ (D2n, D2n+1). Several names deliberately share one physical register:
@   Q3  : qMask / dMask_0,dMask_1 (U32 view), dOut_0,dOut_1 (F32 view), qAcc1
@   Q13 : qMask1 / dMask1_0,dMask1_1, qAcc2
@ firInnerLoop writes qAcc1 and qAcc2, so the contents of qMask and qMask1
@ do not survive it; code that needs qMask1 afterwards must reload it.
qInp             .qn   Q0.F32          @ current four input/state samples
dInp_0           .dn   D0.F32
dInp_1           .dn   D1.F32
qCoeff           .qn   Q1.F32          @ current four coefficients
dCoeff_0         .dn   D2.F32
dCoeff_1         .dn   D3.F32
qZero            .qn   Q2.F32          @ cleared with VEOR during setup and used as the "else" operand of VBSL

qMask            .qn   Q3.U32          @ working select mask / VBSL destination
dMask_0          .dn   D6.U32
dMask_1          .dn   D7.U32
dOut_0           .dn   D6.F32          @ F32 view of the same D6/D7 pair as dMask_0/dMask_1
dOut_1           .dn   D7.F32

qAcc0            .qn   Q8.F32          @ main accumulator; stored to pDst
dAcc0_0          .dn   D16.F32
dAcc0_1          .dn   D17.F32


qTemp            .qn   Q9.F32          @ next four samples (look-ahead for VEXT)
dTemp_0          .dn   D18.F32
dTemp_1          .dn   D19.F32

qTemp1           .qn   Q10.F32         @ sample window shifted by 1 (also a scratch load target in the tail)
dTemp1_0         .dn   D20.F32
dTemp1_1         .dn   D21.F32
qTemp2           .qn   Q11.F32         @ sample window shifted by 2
qTemp3           .qn   Q12.F32         @ sample window shifted by 3
qMask1           .qn   Q13.U32         @ mask loaded from table entry (blockSize & 3)
dMask1_0         .dn   D26.U32
dMask1_1         .dn   D27.U32
qMaskTmp         .qn   Q14.U32         @ mask loaded from table entry (numTaps & 3)
dMaskTmp_0       .dn   D28.U32
dMaskTmp_1       .dn   D29.U32

qAcc1            .qn   Q3.F32          @ partial product; same register as qMask
qAcc2            .qn   Q13.F32         @ partial product; same register as qMask1
qAcc3            .qn   Q15.F32         @ partial product




@ Setup: read the three instance fields, compute the write position in the
@ state buffer, fetch the two select masks and bias blockSize for the loop.
@ Each load post-increments pStateStruct by 4, so after these three
@ instructions pStateStruct points 12 bytes past the start of the instance.
LDRH        numTaps,[pStateStruct],#4      @ halfword at offset 0
LDR         pState,[pStateStruct],#4       @ word at offset 4
LDR         pCoeffs,[pStateStruct],#4      @ word at offset 8

@/* S->state buffer contains previous frame (numTaps - 1) samples */
@/* pStateCurnt points to the location where the new input data should be written */
@/*pStateCurnt = &(S->state[(numTaps - 1u)])@*/
SUB         mask,numTaps,#1
LDR         pMask,=ne10_qMaskTable32       @ literal-pool load of the table address
                   AND         tapCnt,numTaps,#3               @ numTaps % 4 -> index of the tap mask
                   ADD         pStateCurnt,pState,mask,LSL #2  @ pState + (numTaps-1) * 4 bytes
                   AND         mask,blockSize,#3               @ blockSize % 4 -> index of the block mask


                   @/* Apply loop unrolling and compute 4 output values simultaneously.
                    @* The variables acc0 ... acc3 hold output values that are being computed:
                    @*
                    @*    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
                    @*    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
                    @*    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
                    @*    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
                    @*/

                   @/*If numTaps,blockSize are not  multiples of 4,  Get the appropriate Masks*/
                   @ The table is addressed in 16-byte (one Q register) entries: index << 4.


                   ADD         pTemp,pMask,tapCnt,LSL #4
                   VEOR        qZero,qZero                     @ qZero = 0
                   ADD         pX,pMask,mask,LSL #4
                   VLD1        {dMaskTmp_0,dMaskTmp_1},[pTemp] @ qMaskTmp = table[numTaps & 3]
                   VLD1        {dMask1_0,dMask1_1},[pX]        @ qMask1   = table[blockSize & 3]


                   @/* Copy blockCnt number of  new input samples into the state buffer */

                   SUBS        blockSize,#4                    @ bias by -4 so the sign flag tells whether a full group of 4 remains
                   BLT         firEndOuterLoop                 @ fewer than 4 samples in total: go straight to the tail code

                   @/* Compute 4 outputs at a time*/

                   @ Outer loop: one iteration consumes four input samples and
                   @ produces four outputs in qAcc0.
                   firOuterLoop:

                   VLD1        {dTemp_0,dTemp_1},[pSrc]!       @ fetch four new input samples, advance pSrc
                   MOV         pX,pState                       @ restart the sample walk at the current state position
                   MOV         pB,pCoeffs                      @ restart the coefficient walk
                   @/* Read the first four samples from the state buffer:
                    @* x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2],x[n-numTaps-3] */

                   VST1        {dTemp_0,dTemp_1},[pStateCurnt]! @ append the new samples to the state buffer
                   @/* Zero the Accumulators*/
                   VEOR        qAcc0,qAcc0
                   VLD1        {dInp_0,dInp_1},[pX]!

                   @//* Read the first four coefficients b[numTaps] to b[numTaps-3] */
                   VLD1        {dCoeff_0,dCoeff_1},[pB]!
                   @/* Loop unrolling.  Process 4 taps at a time. */
                   SUBS        tapCnt,numTaps,#4               @ sets the flags tested by the BLT below (VLD1 leaves them alone)
                   VLD1        {dTemp_0,dTemp_1},[pX]!         @ look-ahead: the four samples after qInp

                   BLT         firEndInnerLoop                 @ fewer than four taps: only the masked remainder runs

                   @ Inner loop: four taps per iteration. VEXT builds the sample
                   @ window shifted by 1, 2 and 3 elements out of qInp:qTemp; each
                   @ window is scaled by one coefficient lane and summed into qAcc0.
                   @ NOTE(review): qAcc1 is Q3 (= qMask) and qAcc2 is Q13 (= qMask1),
                   @ so this loop overwrites both masks. qMask is rebuilt from
                   @ qMaskTmp after the loop; qMask1 must be reloaded before it is
                   @ used again.
                   firInnerLoop:
                   VEXT        qTemp1,qInp,qTemp,#1
                   @/* acc0 +=  b[numTaps] * x[n-numTaps-1]+ b[numTaps] * x[n-numTaps-2] +
                    @* b[numTaps] * x[n-numTaps-3] +  b[numTaps] * x[n-numTaps-4]*/
                   VMLA        qAcc0,qInp,dCoeff_0[0]
                   @/* acc1 +=  b[numTaps-1] * x[n-numTaps-2]+ b[numTaps-1] * x[n-numTaps-3] +
                    @b[numTaps-1] * x[n-numTaps-4] +*b[numTaps-1] * x[n-numTaps-5]*/
                   VMUL        qAcc1,qTemp1,dCoeff_0[1]

                   VEXT        qTemp2,qInp,qTemp,#2
                   @/* acc2 +=  b[numTaps-2] * x[n-numTaps-3]+ b[numTaps-2] * x[n-numTaps-4] +
                    @b[numTaps-2] * x[n-numTaps-5] + *b[numTaps-2] * x[n-numTaps-6]*/
                   VMUL        qAcc2,qTemp2,dCoeff_1[0]
                   VADD        qAcc0, qAcc0, qAcc1

                   VEXT        qTemp3,qInp,qTemp,#3
                   @/* acc3 +=  b[numTaps-3] * x[n-numTaps-4]+ b[numTaps-3] * x[n-numTaps-5] +
                    @b[numTaps-3] * x[n-numTaps-6] +*b[numTaps-3] * x[n-numTaps-7]  */
                   VMUL        qAcc3,qTemp3,dCoeff_1[1]
                   VADD        qAcc0, qAcc0, qAcc2

                   VMOV        qInp,qTemp                      @ slide the window: look-ahead becomes current
                   VLD1        {dTemp_0,dTemp_1},[pX]!         @ fetch the next look-ahead
                   VADD        qAcc0, qAcc0, qAcc3

                   SUBS        tapCnt,#4
                   @/* Read the b[numTaps-4] to b[numTaps-7]  coefficients */
                   VLD1        {dCoeff_0,dCoeff_1},[pB]!


                   BGE         firInnerLoop                    @ flags still come from the SUBS above
                   firEndInnerLoop:

                   ADDS        tapCnt, tapCnt, #4              @ undo the bias: tapCnt = taps still to do (0..3)
                   BEQ         firStoreOutput

                   @/* If the filter length is not a multiple of 4, compute the remaining filter taps */
                   @/*Select only the remaining filter Taps*/
                   VMOV        qMask,qMaskTmp
                   VBSL        qMask,qCoeff,qZero              @ qMask = qCoeff where mask bits are set, 0 elsewhere
                   VEXT        qTemp1,qInp,qTemp,#1
                   VMLA        qAcc0,qInp,dOut_0[0]            @ dOut_* is the F32 view of qMask, i.e. the masked coefficients
                   VEXT        qTemp2,qInp,qTemp,#2
                   VMLA        qAcc0,qTemp1,dOut_0[1]
                   VMLA        qAcc0,qTemp2,dOut_1[0]          @ at most three leftover taps are applied

                   firStoreOutput:
                   @/* Advance the state pointer by 4 to process the next group of 4 samples */
                   ADD         pState,#16                      @ 4 samples * 4 bytes

                   @/* Store the four results held in qAcc0 to the destination buffer.
                    @ * No format conversion is performed here. */
                   SUBS        blockSize,#4
                   VST1        {dAcc0_0,dAcc0_1},[pDst]!

                   BGE         firOuterLoop                    @ another full group of four samples remains

                   @ Tail: process the blockSize % 4 samples left over after the
                   @ 4-wide outer loop. Whole Q registers are still loaded and stored;
                   @ VBSL with the block mask keeps only the valid lanes.
                   firEndOuterLoop:
                   @/*Handle BlockSize Not a Multiple of 4*/
                   ADDS        blockSize,#4                    @ undo the -4 loop bias: blockSize = samples left (0..3)
                   BEQ         firCopyData                     @ nothing left: go shift the state buffer

                   @ Reload the block mask before using it. qMask1 lives in Q13, the
                   @ same register as qAcc2, and firInnerLoop writes qAcc2; without
                   @ this reload the VBSLs below would select lanes with a float
                   @ product instead of the mask whenever the outer loop has run.
                   @ blockSize now equals the original blockSize & 3, i.e. the same
                   @ table index that was used for the initial load. pX is free here
                   @ (it is re-initialised from pState below) and ADD leaves the
                   @ flags untouched.
                   ADD         pX,pMask,blockSize,LSL #4
                   VLD1        {dMask1_0,dMask1_1},[pX]

                   @/*Copy the Remaining BlockSize Number of Input Sample to state Buffer*/
                   VMOV        qMask,qMask1
                   VLD1        {dTemp1_0,dTemp1_1},[pStateCurnt] @ current contents of the state slot
                   VLD1        {dTemp_0,dTemp_1},[pSrc]        @ a full four-sample load; only the masked lanes are kept

                   ADD         pSrc,pSrc,blockSize,LSL #2      @ advance by the samples actually consumed
                   MOV         pX,pState
                   MOV         pB,pCoeffs

                   VBSL        qMask,qTemp,qTemp1              @ new samples in the selected lanes, old state elsewhere
                   VST1        {dMask_0,dMask_1},[pStateCurnt]
                   VLD1        {dInp_0,dInp_1},[pX]!

                   ADD         pStateCurnt,pStateCurnt,blockSize, LSL #2

                   @/* Zero the Accumulators*/
                   VEOR        qAcc0,qAcc0
                   VLD1        {dCoeff_0,dCoeff_1},[pB]!
                   SUBS        tapCnt,numTaps,#4               @ sets the flags tested by the BLT below
                   VLD1        {dTemp_0,dTemp_1},[pX]!

                   BLT         firEndInnerLoop1

                   @ Same four-taps-per-iteration walk as the main inner loop, but
                   @ accumulating directly into qAcc0 with VMLA, so neither qMask1
                   @ nor qMaskTmp is disturbed here.
                   firInnerLoop1:

                   VEXT        qTemp1,qInp,qTemp,#1
                   VMLA        qAcc0,qInp,dCoeff_0[0]
                   VEXT        qTemp2,qInp,qTemp,#2
                   VMLA        qAcc0,qTemp1,dCoeff_0[1]
                   VEXT        qTemp3,qInp,qTemp,#3
                   VMLA        qAcc0,qTemp2,dCoeff_1[0]
                   VMOV        qInp,qTemp
                   VMLA        qAcc0,qTemp3,dCoeff_1[1]
                   VLD1        {dCoeff_0,dCoeff_1},[pB]!
                   SUBS        tapCnt,#4
                   VLD1        {dTemp_0,dTemp_1},[pX]!

                   BGE         firInnerLoop1
                   firEndInnerLoop1:

                   @ Leftover taps: keep only the coefficients selected by the tap
                   @ mask (zero elsewhere) and apply up to three of them.
                   VMOV        qMask,qMaskTmp
                   VBSL        qMask,qCoeff,qZero
                   VEXT        qTemp1,qInp,qTemp,#1
                   VMLA        qAcc0,qInp,dOut_0[0]            @ dOut_* is the F32 view of qMask
                   VEXT        qTemp2,qInp,qTemp,#2
                   VMLA        qAcc0,qTemp1,dOut_0[1]
                   VMLA        qAcc0,qTemp2,dOut_1[0]
                   VMOV        qMask,qMask1                    @ back to the block mask for the output merge
                   VLD1        {dTemp_0,dTemp_1},[pDst]        @ existing destination contents


                   @/* If the blockSize is not a multiple of 4, Mask the unwanted Output */

                   VBSL        qMask,qAcc0,qTemp               @ results in the selected lanes, old pDst data elsewhere
                   VST1        {dMask_0,dMask_1},[pDst]
                   ADD         pDst,pDst,blockSize,LSL #2
                   ADD         pState,pState,blockSize,LSL #2


                   @ Epilogue: move the samples that start at pState (which has been
                   @ advanced past the consumed input) to the start of the state
                   @ buffer, then return.
                   firCopyData:
                   @/* Processing is complete.  Now shift the data in the state buffer down by
                    @** blockSize samples.  This prepares the state buffer for the next function
                    @** call. */

                   @/* Points to the start of the state buffer */

                   SUB         numTaps,numTaps,#1              @ numTaps - 1 samples are carried over
                   AND         mask,numTaps,#3                 @ (numTaps - 1) % 4 -> mask table index
                   LDR         pStateCurnt,[pStateStruct,#-8]  @ pStateStruct is 12 bytes past the instance start, so this re-reads the word at offset 4 (the one loaded into pState at function start)
                   ADD         pTemp,pMask,mask,LSL #4
                   VLD1        {dInp_0,dInp_1},[pState]!
                   VLD1        {dMask_0,dMask_1},[pTemp]       @ qMask = table[(numTaps - 1) & 3]


                   @/* copy data */

                   SUBS        Count,numTaps,#4                @ Count shares R11 with pTemp, which is no longer needed
                   BLT         firEnd
                   @ Copy four samples per iteration while at least four remain.
                   firCopyLoop:
                   VST1        {dInp_0,dInp_1},[pStateCurnt]!
                   SUBS        Count,#4
                   VLD1        {dInp_0,dInp_1},[pState]!
                   BGE         firCopyLoop

                   firEnd:

                   @ Final partial group: merge the selected lanes of qInp into
                   @ the existing contents of the destination slot.
                   VLD1        {dTemp_0,dTemp_1},[pStateCurnt]
                   VBSL        qMask,qInp,qTemp                @ qInp where mask bits are set, old data elsewhere
                   VST1        {dOut_0,dOut_1},[pStateCurnt]   @ dOut_* is the same D6/D7 pair that VBSL just wrote
                   ADD         pStateCurnt,pStateCurnt,mask, LSL #2 @ NOTE(review): the result is not read before the return below

                   @/*Return From Function*/
                   POP     {r4-r12,pc}                         @ restores r4-r12 and returns by loading the saved lr into pc
                   @/*ARM Registers*/
                   @ Release every alias created with .req/.qn/.dn above so the
                   @ names can be redefined by code assembled later.
                   .unreq    pStateStruct
                   .unreq    pSrc
                   .unreq    pDst
                   .unreq    blockSize

                   .unreq    pState
                   .unreq    pCoeffs
                   .unreq    pStateCurnt

                   .unreq    pX
                   .unreq    pB
                   .unreq    numTaps

                   .unreq    tapCnt
                   .unreq    Count
                   .unreq    pTemp
                   .unreq    pMask

                   .unreq    mask

                   @/*NEON variale Declaration*/
                   .unreq    qInp
                   .unreq    dInp_0
                   .unreq    dInp_1
                   .unreq    qCoeff
                   .unreq    dCoeff_0
                   .unreq    dCoeff_1
                   .unreq    qZero

                   .unreq    qMask
                   .unreq    dMask_0
                   .unreq    dMask_1
                   .unreq    dOut_0
                   .unreq    dOut_1

                   .unreq    qAcc0
                   .unreq    dAcc0_0
                   .unreq    dAcc0_1

                   .unreq    qTemp
                   .unreq    dTemp_0
                   .unreq    dTemp_1

                   .unreq    qTemp1
                   .unreq    dTemp1_0
                   .unreq    dTemp1_1
                   .unreq    qTemp2
                   .unreq    qTemp3
                   .unreq    qMask1
                   .unreq    dMask1_0
                   .unreq    dMask1_1
                   .unreq    qMaskTmp
                   .unreq    dMaskTmp_0
                   .unreq    dMaskTmp_1

                   .unreq    qAcc1
                   .unreq    qAcc2
                   .unreq    qAcc3
                   @ .end marks the end of the assembly input; the assembler ignores anything after it.
                   .end

解决方案

Got it: `.req` is the same as `RN`. After making some adjustments and removing the unnecessary directives, I got it working!

这篇关于转换GCC汇编code与armasm汇编code的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆