将GCC汇编代码转换为armasm汇编代码 [英] Converting GCC assembly code to armasm assembly code
本文介绍了转换GCC汇编code与armasm汇编code的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
我想把 GCC 汇编代码转换为 ARMASM 汇编代码,有人能帮帮我吗?主要的问题是 .req、.unreq、.qn 和 .dn 这几条指令,我想知道它们在 armasm 中的等价写法。我试过 ALIAS,但不起作用。
.align 4
.global ne10_fir_float_neon
.extern ne10_qMaskTable32
.thumb
.thumb_func
ne10_fir_float_neon:
PUSH {r4-r12,lr} @ push r12: 保持栈 8 字节对齐
@ / * ARM寄存器* /
pStateStruct .req R0
pSrc .req R1
pDst .req R2
blockSize .req R3
pState .req R4 @/* 状态指针 */
pCoeffs .req R5 @/* 系数指针 */
pStateCurnt .req R6 @/* 指向状态缓冲区的当前采样 */
pX .req R7 @/* 状态缓冲区的临时指针 */
pB .req R8 @/* 系数缓冲区的临时指针 */
numTaps .req R9 @/* 滤波器长度 */
tapCnt .req R10 @/* 循环计数器 */
Count .req R11 @/* 循环计数器 */
pTemp .req R11
pMask .req R14 @/* 掩码表 */
mask .req R12
@/* NEON 变量声明 */
qInp .qn Q0.F32
dInp_0 .DN D0.F32
dInp_1 .DN D1.F32
qCoeff .qn Q1.F32
dCoeff_0 .DN D2.F32
dCoeff_1 .DN D3.F32
qZero .qn Q2.F32qMask .qn Q3.U32
dMask_0 .DN D6.U32
dMask_1 .DN D7.U32
dOut_0 .DN D6.F32
dOut_1 .DN D7.F32qAcc0 .qn Q8.F32
dAcc0_0 .DN D16.F32
dAcc0_1 .DN D17.F32
QTEMP .qn Q9.F32
dTemp_0 .DN D18.F32
dTemp_1 .DN D19.F32qTemp1 .qn Q10.F32
dTemp1_0 .DN D20.F32
dTemp1_1 .DN D21.F32
qTemp2 .qn Q11.F32
qTemp3 .qn Q12.F32
qMask1 .qn Q13.U32
dMask1_0 .DN D26.U32
dMask1_1 .DN D27.U32
qMaskTmp .qn Q14.U32
dMaskTmp_0 .DN D28.U32
dMaskTmp_1 .DN D29.U32qAcc1 .qn Q3.F32
qAcc2 .qn Q13.F32
qAcc3 .qn Q15.F32
LDRH numTaps,[pStateStruct],#4
LDR pState,[pStateStruct],#4
LDR pCoeffs,[pStateStruct],#4@ / * S-GT&;态缓冲器包含previous框架(numTaps - 1)样品* /
@ / * pStateCurnt指向其中新的输入数据应写入的位置* /
@ / * pStateCurnt =及(S->状态[(numTaps - 1U)])@ * /
SUB mask,numTaps,#1
LDR pMask,=ne10_qMaskTable32
AND tapCnt,numTaps,#3
ADD pStateCurnt,pState,mask,LSL #2
AND mask,blockSize,#3
@ / *应用循环展开,同时计算4输出值。
@ *变量ACC0 ......正在计算ACC3保持输出值:
@ *
@ * ACC0 = B [numTaps-1] * X [正numTaps-1] + B [numTaps-2] * X [正numTaps-2] + B [numTaps-3] * X [正numTaps -3- ] + ... + b [0] * X [0]
@ * ACC1 = B [numTaps-1] * X [正numTaps] + B [numTaps-2] * X [正numTaps-1] + B [numTaps-3] * X [正numTaps-2] + ... + b [0] * X [1]
@ * ACC2 = B [numTaps-1] * X [正numTaps + 1〕+ B [numTaps-2] * X [正numTaps] + B [numTaps-3] * X [正numTaps-1] + ... + b [0] * X [2]
@ * ACC3 = B [numTaps-1] * X [正numTaps + 2] + B [numTaps-2] * X [正numTaps + 1〕+ B [numTaps-3] * X [正numTaps] + ... + b [0] * X [3]
@ * / @ / *如果numTaps,块大小不是4的倍数,获取适当的面具* /
ADD PTEMP,PMASK,tapCnt,LSL#4
VEOR qZero,qZero
ADD像素,PMASK,面具,LSL#4
VLD1 {dMaskTmp_0,dMaskTmp_1},[PTEMP]
VLD1 {dMask1_0,dMask1_1},[的pX]
@ / *复制blockCnt一些新的输入样本的进入状态缓冲区* / SUBS BLOCKSIZE,#4
BLT firEndOuterLoop @ / *计算4一次输出* / firOuterLoop: VLD1 {dTemp_0,dTemp_1},[PSRC]!
MOV像素,pState
MOV PB,pCoeffs
@ / *读取从态缓冲器的前四个样本:
@ * X [正numTaps]中,x [正numTaps-1],X [正numTaps-2]中,x [正numTaps-3] * / VST1 {dTemp_0,dTemp_1},[pStateCurnt]!
@ / *零累加器* /
VEOR qAcc0,qAcc0
VLD1 {dInp_0,dInp_1},[的pX]! // @ *阅读前四系数B〔numTaps]到b [numTaps-3] * /
VLD1 {dCoeff_0,dCoeff_1},[PB]!
@ / *循环展开。处理4抽头一次。 * /
SUBS tapCnt,numTaps,#4
VLD1 {dTemp_0,dTemp_1},[的pX]! BLT firEndInnerLoop firInnerLoop:
VEXT qTemp1,qInp,QTEMP,#1
@ / * ACC0 + = B [numTaps] * X [正numTaps-1] + B [numTaps] * X [正numTaps-2] +
@ * B [numTaps] * X [正numTaps-3] + B [numTaps] * X [正numTaps-4] * /
VMLA qAcc0,qInp,dCoeff_0 [0]
@ / * ACC1 + = B [numTaps-1] * X [正numTaps-2] + B [numTaps-1] * X [正numTaps-3] +
@b [numTaps-1] * X [正numTaps-4] + * B [numTaps-1] * X [正numTaps-5] * /
VMUL qAcc1,qTemp1,dCoeff_0 [1] VEXT qTemp2,qInp,QTEMP,#2
@ / * ACC2 + = B [numTaps-2] * X [正numTaps-3] + B [numTaps-2] * X [正numTaps-4] +
@b [numTaps-2] * X [正numTaps-5] + * B [numTaps-2] * X [正numTaps-6] * /
VMUL qAcc2,qTemp2,dCoeff_1 [0]
VADD qAcc0,qAcc0,qAcc1 VEXT qTemp3,qInp,QTEMP,#3
@ / * ACC3 + = B [numTaps-3] * X [正numTaps-4] + B [numTaps-3] * X [正numTaps-5] +
@b [numTaps-3] * X [正numTaps-6] + * B [numTaps-3] * X [正numTaps-7] * /
VMUL qAcc3,qTemp3,dCoeff_1 [1]
VADD qAcc0,qAcc0,qAcc2 VMOV qInp,QTEMP
VLD1 {dTemp_0,dTemp_1},[的pX]!
VADD qAcc0,qAcc0,qAcc3 SUBS tapCnt,#4
@ / *读取B〔numTaps-4] B〔numTaps-7]系数* /
VLD1 {dCoeff_0,dCoeff_1},[PB]!
BGE firInnerLoop
firEndInnerLoop: ADDS tapCnt,tapCnt,#4
BEQ firStoreOutput @ / *如果滤波器长度不是4的倍数,计算剩余的滤波器抽头* /
@ / *选择仅剩下滤波器抽头* /
VMOV qMask,qMaskTmp
VBSL qMask,qCoeff,qZero
VEXT qTemp1,qInp,QTEMP,#1
VMLA qAcc0,qInp,dOut_0 [0]
VEXT qTemp2,qInp,QTEMP,#2
VMLA qAcc0,qTemp1,dOut_0 [1]
VMLA qAcc0,qTemp2,dOut_1 [0] firStoreOutput:
@ / * 4推进状态的指针来处理的4个样品下一组* /
ADD pState,#16 @ / *在4累加器的结果是在2.30的格式。转换为1.31
@ *然后存储在目的地缓冲的4个输出。 * /
SUBS BLOCKSIZE,#4
VST1 {dAcc0_0,dAcc0_1},[pDst]! BGE firOuterLoop firEndOuterLoop:
@ / *处理块大小不是4的倍数* /
ADDS BLOCKSIZE,#4
BEQ firCopyData
@ / *复制输入采样的块大小剩余数量态缓冲器* /
VMOV qMask,qMask1
VLD1 {dTemp1_0,dTemp1_1},[pStateCurnt]
VLD1 {dTemp_0,dTemp_1},[PSRC] ADD PSRC,PSRC,块大小,LSL#2
MOV像素,pState
MOV PB,pCoeffs VBSL qMask,QTEMP,qTemp1
VST1 {dMask_0,dMask_1},[pStateCurnt]
VLD1 {dInp_0,dInp_1},[的pX]! ADD pStateCurnt,pStateCurnt,块大小,LSL#2 @ / *零累加器* /
VEOR qAcc0,qAcc0
VLD1 {dCoeff_0,dCoeff_1},[PB]!
SUBS tapCnt,numTaps,#4
VLD1 {dTemp_0,dTemp_1},[的pX]! BLT firEndInnerLoop1 firInnerLoop1: VEXT qTemp1,qInp,QTEMP,#1
VMLA qAcc0,qInp,dCoeff_0 [0]
VEXT qTemp2,qInp,QTEMP,#2
VMLA qAcc0,qTemp1,dCoeff_0 [1]
VEXT qTemp3,qInp,QTEMP,#3
VMLA qAcc0,qTemp2,dCoeff_1 [0]
VMOV qInp,QTEMP
VMLA qAcc0,qTemp3,dCoeff_1 [1]
VLD1 {dCoeff_0,dCoeff_1},[PB]!
SUBS tapCnt,#4
VLD1 {dTemp_0,dTemp_1},[的pX]! BGE firInnerLoop1
firEndInnerLoop1:
VMOV qMask,qMaskTmp
VBSL qMask,qCoeff,qZero
VEXT qTemp1,qInp,QTEMP,#1
VMLA qAcc0,qInp,dOut_0 [0]
VEXT qTemp2,qInp,QTEMP,#2
VMLA qAcc0,qTemp1,dOut_0 [1]
VMLA qAcc0,qTemp2,dOut_1 [0]
VMOV qMask,qMask1
VLD1 {dTemp_0,dTemp_1},[pDst]
@ / *如果块大小不是4的倍数,屏蔽不需要的输出* / VBSL qMask,qAcc0,QTEMP
VST1 {dMask_0,dMask_1},[pDst]
ADD pDst,pDst,块大小,LSL#2
ADD pState,pState,块大小,LSL#2
firCopyData:
@ / *处理就完成了。现在,数据的状态缓冲下降移
@ ** BLOCKSIZE样本。这prepares下一个功能态缓冲器
@ **电话。 * / @ / *点的状态缓冲区的开始* / SUB numTaps,numTaps,#1
和口罩,numTaps,#3
LDR pStateCurnt,[pStateStruct,# - 8]
ADD PTEMP,PMASK,面具,LSL#4
VLD1 {dInp_0,dInp_1},[pState]!
VLD1 {dMask_0,dMask_1},[PTEMP]
@/* 复制数据 */
SUBS Count,numTaps,#4
BLT firEnd
firCopyLoop:
VST1 {dInp_0,dInp_1},[pStateCurnt]!
SUBS Count,#4
VLD1 {dInp_0,dInp_1},[pState]!
BGE firCopyLoop
firEnd:
VLD1 {dTemp_0,dTemp_1},[pStateCurnt]
VBSL qMask,qInp,QTEMP
VST1 {dOut_0,dOut_1},[pStateCurnt]
ADD pStateCurnt,pStateCurnt,面具,LSL#2 @ / *返回从功能* /
POP {R4-R12,PC}
@ / * ARM寄存器* /
.unreq pStateStruct
.unreq PSRC
.unreq pDst
.unreq BLOCKSIZE .unreq pState
.unreq pCoeffs
.unreq pStateCurnt .unreq的pX
.unreq PB
.unreq numTaps .unreq tapCnt
.unreq计数
.unreq PTEMP
.unreq PMASK .unreq面具 @ / * NEON variale声明* /
.unreq qInp
.unreq dInp_0
.unreq dInp_1
.unreq qCoeff
.unreq dCoeff_0
.unreq dCoeff_1
.unreq qZero .unreq qMask
.unreq dMask_0
.unreq dMask_1
.unreq dOut_0
.unreq dOut_1 .unreq qAcc0
.unreq dAcc0_0
.unreq dAcc0_1 .unreq QTEMP
.unreq dTemp_0
.unreq dTemp_1 .unreq qTemp1
.unreq dTemp1_0
.unreq dTemp1_1
.unreq qTemp2
.unreq qTemp3
.unreq qMask1
.unreq dMask1_0
.unreq dMask1_1
.unreq qMaskTmp
.unreq dMaskTmp_0
.unreq dMaskTmp_1 .unreq qAcc1
.unreq qAcc2
.unreq qAcc3
.end
解决方案
搞定了:.req 等同于 armasm 的 RN;做了一些调整并删除了不必要的指令之后,代码就可以正常工作了!
I am trying to convert GCC assembly code to ARMASM assembly code; can anyone please help me with this? The main problem is the .req, .unreq, .qn and .dn directives. I would like to know the ARMASM equivalents of these directives. I tried ALIAS, but it did not work.
@-----------------------------------------------------------------------
@ ne10_fir_float_neon -- single-precision FIR filter (NEON, Thumb, GNU as)
@
@ In:    r0 = pStateStruct  filter instance; this routine reads
@                             +0  halfword  numTaps
@                             +4  word      pState   (state buffer pointer)
@                             +8  word      pCoeffs  (coefficient pointer)
@        r1 = pSrc          input samples  (32-bit floats)
@        r2 = pDst          output samples (32-bit floats)
@        r3 = blockSize     number of samples to process
@ Out:   results stored at pDst; the state buffer is shifted down so that it
@        is ready for the next call.
@ Saves: r4-r12, lr are pushed on entry and popped on exit (10 words, so sp
@        stays 8-byte aligned).
@ Clobb: r0-r3, q0-q3, q8-q15, flags.
@ Uses:  ne10_qMaskTable32 (extern), read as 16-byte entries indexed by (n & 3).
@        NOTE(review): presumably entry k has its first k 32-bit lanes set to
@        all ones and the rest zero -- confirm against the table definition.
@ Note:  every vector load/store is 16 bytes wide. Partial groups are merged
@        with VBSL, so memory past a partial group is read, and the lanes the
@        mask does not select are written back with the value just loaded.
@-----------------------------------------------------------------------
.align 4
.global ne10_fir_float_neon
.extern ne10_qMaskTable32
.thumb
.thumb_func
ne10_fir_float_neon:
        PUSH    {r4-r12,lr}             @ r12 is included to keep the stack 8-byte aligned

        @ ---- ARM core register aliases ----
        pStateStruct .req R0
        pSrc         .req R1
        pDst         .req R2
        blockSize    .req R3
        pState       .req R4            @ state pointer
        pCoeffs      .req R5            @ coefficient pointer
        pStateCurnt  .req R6            @ where new input samples are written in the state buffer
        pX           .req R7            @ temporary pointer into the state buffer
        pB           .req R8            @ temporary pointer into the coefficient buffer
        numTaps      .req R9            @ filter length
        tapCnt       .req R10           @ tap loop counter
        Count        .req R11           @ copy loop counter (shares R11 with pTemp)
        pTemp        .req R11           @ scratch pointer into the mask table
        pMask        .req R14           @ mask table base (LR was saved by the PUSH above)
        mask         .req R12           @ index into the mask table

        @ ---- NEON register aliases ----
        qInp         .qn Q0.F32
        dInp_0       .dn D0.F32
        dInp_1       .dn D1.F32
        qCoeff       .qn Q1.F32
        dCoeff_0     .dn D2.F32
        dCoeff_1     .dn D3.F32
        qZero        .qn Q2.F32
        qMask        .qn Q3.U32         @ Q3 is shared by qMask, dOut_* and qAcc1
        dMask_0      .dn D6.U32
        dMask_1      .dn D7.U32
        dOut_0       .dn D6.F32         @ F32 view of Q3 after a VBSL into qMask
        dOut_1       .dn D7.F32
        qAcc0        .qn Q8.F32
        dAcc0_0      .dn D16.F32
        dAcc0_1      .dn D17.F32
        qTemp        .qn Q9.F32
        dTemp_0      .dn D18.F32
        dTemp_1      .dn D19.F32
        qTemp1       .qn Q10.F32
        dTemp1_0     .dn D20.F32
        dTemp1_1     .dn D21.F32
        qTemp2       .qn Q11.F32
        qTemp3       .qn Q12.F32
        qMask1       .qn Q13.U32        @ block-remainder mask; Q13 is shared with qAcc2
        dMask1_0     .dn D26.U32
        dMask1_1     .dn D27.U32
        qMaskTmp     .qn Q14.U32        @ tap-remainder mask
        dMaskTmp_0   .dn D28.U32
        dMaskTmp_1   .dn D29.U32
        qAcc1        .qn Q3.F32
        qAcc2        .qn Q13.F32
        qAcc3        .qn Q15.F32

        @ Load the instance fields; pStateStruct ends up 12 bytes past its start.
        LDRH    numTaps, [pStateStruct], #4
        LDR     pState, [pStateStruct], #4
        LDR     pCoeffs, [pStateStruct], #4

        @ pStateCurnt = &state[numTaps - 1]: the slot where new input is appended.
        SUB     mask, numTaps, #1
        LDR     pMask, =ne10_qMaskTable32
        AND     tapCnt, numTaps, #3
        ADD     pStateCurnt, pState, mask, LSL #2
        AND     mask, blockSize, #3     @ mask = blockSize & 3, kept until firCopyData

        @ Fetch the masks used when numTaps / blockSize are not multiples of 4.
        ADD     pTemp, pMask, tapCnt, LSL #4
        VEOR    qZero, qZero
        ADD     pX, pMask, mask, LSL #4
        VLD1    {dMaskTmp_0,dMaskTmp_1}, [pTemp]
        VLD1    {dMask1_0,dMask1_1}, [pX]

        @ blockSize is biased by -4 while whole groups of 4 outputs remain.
        SUBS    blockSize, #4
        BLT     firEndOuterLoop

        @ ---- Main loop: 4 outputs per iteration, one per lane of qAcc0 ----
firOuterLoop:
        VLD1    {dTemp_0,dTemp_1}, [pSrc]!
        MOV     pX, pState
        MOV     pB, pCoeffs
        @ Append the 4 new input samples to the state buffer.
        VST1    {dTemp_0,dTemp_1}, [pStateCurnt]!
        @ Zero the accumulator.
        VEOR    qAcc0, qAcc0
        @ qInp = first 4 state samples, qCoeff = first 4 coefficients.
        VLD1    {dInp_0,dInp_1}, [pX]!
        VLD1    {dCoeff_0,dCoeff_1}, [pB]!
        @ Process 4 taps per inner iteration; qTemp holds the following 4 samples.
        SUBS    tapCnt, numTaps, #4
        VLD1    {dTemp_0,dTemp_1}, [pX]!
        BLT     firEndInnerLoop

firInnerLoop:
        @ qAcc0 += samples[k+0 .. k+3] * coeff[k]
        VEXT    qTemp1, qInp, qTemp, #1
        VMLA    qAcc0, qInp, dCoeff_0[0]
        @ qAcc1 = samples[k+1 .. k+4] * coeff[k+1]
        VMUL    qAcc1, qTemp1, dCoeff_0[1]
        VEXT    qTemp2, qInp, qTemp, #2
        @ qAcc2 = samples[k+2 .. k+5] * coeff[k+2]   (overwrites Q13 / qMask1)
        VMUL    qAcc2, qTemp2, dCoeff_1[0]
        VADD    qAcc0, qAcc0, qAcc1
        VEXT    qTemp3, qInp, qTemp, #3
        @ qAcc3 = samples[k+3 .. k+6] * coeff[k+3]
        VMUL    qAcc3, qTemp3, dCoeff_1[1]
        VADD    qAcc0, qAcc0, qAcc2
        @ Slide the sample window forward by 4.
        VMOV    qInp, qTemp
        VLD1    {dTemp_0,dTemp_1}, [pX]!
        VADD    qAcc0, qAcc0, qAcc3
        SUBS    tapCnt, #4
        @ Fetch the next 4 coefficients.
        VLD1    {dCoeff_0,dCoeff_1}, [pB]!
        BGE     firInnerLoop

firEndInnerLoop:
        ADDS    tapCnt, tapCnt, #4      @ undo the bias: tapCnt = taps still to do (0..3)
        BEQ     firStoreOutput
        @ Filter length is not a multiple of 4: keep only the remaining
        @ coefficients (the others become 0) and accumulate up to 3 more taps.
        VMOV    qMask, qMaskTmp
        VBSL    qMask, qCoeff, qZero    @ Q3 = masked coefficients, read below as dOut_*
        VEXT    qTemp1, qInp, qTemp, #1
        VMLA    qAcc0, qInp, dOut_0[0]
        VEXT    qTemp2, qInp, qTemp, #2
        VMLA    qAcc0, qTemp1, dOut_0[1]
        VMLA    qAcc0, qTemp2, dOut_1[0]

firStoreOutput:
        @ Advance the state pointer by 4 samples for the next group of outputs.
        ADD     pState, #16
        SUBS    blockSize, #4
        @ Store the 4 float outputs.
        VST1    {dAcc0_0,dAcc0_1}, [pDst]!
        BGE     firOuterLoop

        @ ---- Tail: blockSize not a multiple of 4 ----
firEndOuterLoop:
        ADDS    blockSize, #4           @ undo the bias: blockSize = samples left (0..3)
        BEQ     firCopyData
        @ Reload the block-remainder mask. qMask1 lives in Q13, which firInnerLoop
        @ overwrites through qAcc2, so the copy loaded at function entry cannot be
        @ trusted here. `mask` still holds blockSize & 3; pTemp is free scratch.
        ADD     pTemp, pMask, mask, LSL #4
        VLD1    {dMask1_0,dMask1_1}, [pTemp]
        @ Merge the remaining input samples into the state buffer.
        VMOV    qMask, qMask1
        VLD1    {dTemp1_0,dTemp1_1}, [pStateCurnt]
        VLD1    {dTemp_0,dTemp_1}, [pSrc]
        ADD     pSrc, pSrc, blockSize, LSL #2
        MOV     pX, pState
        MOV     pB, pCoeffs
        VBSL    qMask, qTemp, qTemp1    @ masked lanes <- new input, others keep old state
        VST1    {dMask_0,dMask_1}, [pStateCurnt]
        VLD1    {dInp_0,dInp_1}, [pX]!
        ADD     pStateCurnt, pStateCurnt, blockSize, LSL #2
        @ Zero the accumulator.
        VEOR    qAcc0, qAcc0
        VLD1    {dCoeff_0,dCoeff_1}, [pB]!
        SUBS    tapCnt, numTaps, #4
        VLD1    {dTemp_0,dTemp_1}, [pX]!
        BLT     firEndInnerLoop1

firInnerLoop1:
        @ Same 4-taps-per-iteration scheme, accumulating straight into qAcc0.
        VEXT    qTemp1, qInp, qTemp, #1
        VMLA    qAcc0, qInp, dCoeff_0[0]
        VEXT    qTemp2, qInp, qTemp, #2
        VMLA    qAcc0, qTemp1, dCoeff_0[1]
        VEXT    qTemp3, qInp, qTemp, #3
        VMLA    qAcc0, qTemp2, dCoeff_1[0]
        VMOV    qInp, qTemp
        VMLA    qAcc0, qTemp3, dCoeff_1[1]
        VLD1    {dCoeff_0,dCoeff_1}, [pB]!
        SUBS    tapCnt, #4
        VLD1    {dTemp_0,dTemp_1}, [pX]!
        BGE     firInnerLoop1

firEndInnerLoop1:
        @ Remaining taps: always executed here, the tap mask zeroes unused coefficients.
        VMOV    qMask, qMaskTmp
        VBSL    qMask, qCoeff, qZero
        VEXT    qTemp1, qInp, qTemp, #1
        VMLA    qAcc0, qInp, dOut_0[0]
        VEXT    qTemp2, qInp, qTemp, #2
        VMLA    qAcc0, qTemp1, dOut_0[1]
        VMLA    qAcc0, qTemp2, dOut_1[0]
        @ Merge the valid outputs with what is already at pDst so that lanes
        @ beyond blockSize are written back unchanged.
        VMOV    qMask, qMask1
        VLD1    {dTemp_0,dTemp_1}, [pDst]
        VBSL    qMask, qAcc0, qTemp
        VST1    {dMask_0,dMask_1}, [pDst]
        ADD     pDst, pDst, blockSize, LSL #2
        ADD     pState, pState, blockSize, LSL #2

        @ ---- Shift the state buffer down for the next call ----
firCopyData:
        @ Copy numTaps-1 samples from the advanced pState to the buffer start.
        SUB     numTaps, numTaps, #1
        AND     mask, numTaps, #3
        @ pStateStruct was advanced by 12 above, so -8 re-reads the pState field.
        LDR     pStateCurnt, [pStateStruct, #-8]
        ADD     pTemp, pMask, mask, LSL #4
        VLD1    {dInp_0,dInp_1}, [pState]!
        VLD1    {dMask_0,dMask_1}, [pTemp]
        SUBS    Count, numTaps, #4
        BLT     firEnd

firCopyLoop:
        VST1    {dInp_0,dInp_1}, [pStateCurnt]!
        SUBS    Count, #4
        VLD1    {dInp_0,dInp_1}, [pState]!
        BGE     firCopyLoop

firEnd:
        @ Final partial group: merge under the (numTaps-1)&3 mask.
        VLD1    {dTemp_0,dTemp_1}, [pStateCurnt]
        VBSL    qMask, qInp, qTemp
        VST1    {dOut_0,dOut_1}, [pStateCurnt]
        @ pStateCurnt is not read again before r6 is restored by the POP below.
        ADD     pStateCurnt, pStateCurnt, mask, LSL #2
        @ Return: restore callee-saved registers and load pc from the saved lr.
        POP     {r4-r12,pc}

        @ ---- Release ARM core register aliases ----
        .unreq  pStateStruct
        .unreq  pSrc
        .unreq  pDst
        .unreq  blockSize
        .unreq  pState
        .unreq  pCoeffs
        .unreq  pStateCurnt
        .unreq  pX
        .unreq  pB
        .unreq  numTaps
        .unreq  tapCnt
        .unreq  Count
        .unreq  pTemp
        .unreq  pMask
        .unreq  mask
        @ ---- Release NEON register aliases ----
        .unreq  qInp
        .unreq  dInp_0
        .unreq  dInp_1
        .unreq  qCoeff
        .unreq  dCoeff_0
        .unreq  dCoeff_1
        .unreq  qZero
        .unreq  qMask
        .unreq  dMask_0
        .unreq  dMask_1
        .unreq  dOut_0
        .unreq  dOut_1
        .unreq  qAcc0
        .unreq  dAcc0_0
        .unreq  dAcc0_1
        .unreq  qTemp
        .unreq  dTemp_0
        .unreq  dTemp_1
        .unreq  qTemp1
        .unreq  dTemp1_0
        .unreq  dTemp1_1
        .unreq  qTemp2
        .unreq  qTemp3
        .unreq  qMask1
        .unreq  dMask1_0
        .unreq  dMask1_1
        .unreq  qMaskTmp
        .unreq  dMaskTmp_0
        .unreq  dMaskTmp_1
        .unreq  qAcc1
        .unreq  qAcc2
        .unreq  qAcc3
.end
解决方案
Got it: .req is the same as RN in armasm. I made some adjustments and removed the unnecessary directives, and got it working!
这篇关于转换GCC汇编code与armasm汇编code的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文