ARM头部以获得适当的调用栈 [英] ARM Headers to Get Proper Call Stacks

查看:178
本文介绍了ARM头部以获得适当的调用栈的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我目前正在ARM处理器上对基于Linux的软件进行优化。这些优化主要以ARM和ARM NEON函数的形式提供。



为了描述我使用perf记录和火焰图的软件,然而,一旦我介绍了汇编函数,它们不会堆栈在调用它们的函数之上,而是看起来随机的地方。



因此,我的问题是,我应该在他们的函数中包含什么正确地出现在调用堆栈中。



有一个稍微相关的主题,但没有给出好的答案。我使用相同的标志和mapcs-frame。



下面我给出一个由GCC转换为ARM的C函数的例子。这个ARM函数似乎产生了不错的堆栈,但我想知道为什么。

  int half(int in); 
int sum(int in1,int in2);
int mean(int in1,int in2);

int half(int i)
{
return i / 2;
}

int sum(int i,int j)
{
return i + j;


int mean(int i,int j)
{
int s = sum(i,j);
int m =一半(s);
返回m;
}

int main()
{
int a = 1;
int b = 5;
int i;
int结果;
for(i = 0; i <10000000; i ++){
result = mean(a,b);
}
返回0;





  .cpu cortex-a9 
.eabi_attribute 27,3
.eabi_attribute 28,1
.fpu neon
.eabi_attribute 20,1
.eabi_attribute 21,1
.eabi_attribute 23,3
.eabi_attribute 24,1
.eabi_attribute 25,1
.eabi_attribute 26,2
.eabi_attribute 30,6
.eabi_attribute 34,1
.eabi_attribute 18,4
.fileac
.text
.align 2
.global half
.type half,%函数
half:
@ args = 0,假装= 0,frame = 8
@frame_needed = 1,uses_anonymous_args = 0
mov ip,sp
stmfd sp! ,{fp,ip,lr,pc}
sub fp,ip,#4
sub sp,sp,#8
str r0,[fp,#-16]
ldr r3,[fp,#-16]
mov r2,r3,lsr#31
add r3,r2,r3
mo v r3,r3,asr#1
mov r0,r3
sub sp,fp,#12
ldmfd sp,{fp,sp,pc}
.size half,。 -half
.align 2
.global sum
.type sum,%function
sum:
@ args = 0,pretend = 0,frame = 8
@ frame_needed = 1,uses_anonymous_args = 0
mov ip,sp
stmfd sp !, {fp,ip,lr,pc}
sub fp,ip,#4
sub sp,sp,#8
str r0,[fp,#-16]
str r1,[fp,#-20]
ldr r2,[fp,#-16]
ldr r3,[fp,#-20]
add r3,r2,r3
mov r0,r3
sub sp,fp,#12
ldmfd sp,{ fp,sp,pc}
.size sum,。-sum
.align 2
.global mean
.type mean,%function
mean:
@ args = 0,pretend = 0,frame = 16
@frame_needed = 1,uses_anonymous_args = 0
mov ip,sp
stmfd s p!,{fp,ip,lr,pc}
sub fp,ip,#4
sub sp,sp,#16
str r0,[fp,#-24]
str r1,[fp,#-28]
ldr r1,[fp,#-28]
ldr r0,[fp,#-24]
b1b
str r0,[fp,#-16]
ldr r0,[fp,#-16]
bl half
str r0,[fp,#-20]
ldr r3 ,[fp,#-20]
mov r0,r3
sub sp,fp,#12
ldmfd sp,{fp,sp,pc}
.size mean,。 -mean
.align 2
.global main
.type main,%function
main:
@ args = 0,pretend = 0,frame = 16
@ frame_needed = 1,uses_anonymous_args = 0
mov ip,sp
stmfd sp !, {fp,ip,lr,pc}
sub fp,ip,#4
sub sp,sp,#16
mov r3,#1
str r3,[fp,#-20]
mov r3,#5
str r3,[fp,# -24]
mov r3,#0
str r3,[fp,#-16]
b .L8
.L9:
ldr r1,[fp,#-24]
ldr r0,[fp,#-20]
bl意味着
str r0,[fp,#-28]
ldr r3,[fp,#-16]
add r3,r3,#1
str r3,[fp,#-16]
.L8:
ldr r2,[fp,#-16]
movw r3,#38527
movt r3,152
cmp r2,r3
ble .L9
mov r3,#0
mov r0,r3
sub sp,fp,#12
ldmfd sp,{fp,sp,pc}
。 GCC :( crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09)4.9.2 20140904(prerelease)
.section。 note.GNU-stack,,%progbits

---------- ---------编辑-------------------



以下是示例我正在尝试整合的功能。在连接方面,它所做的一切就是在开始时保存堆栈和链接寄存器并将其设置为结束。我应该添加什么?

  .section .text 

.global ARM_smoothing

ARM_smoothing:
STMFD sp!,{r4-r12,lr} //将使用的寄存器移到堆栈上(避免分段错误)
MOV r5,r0
ADD r0,r0, r2
ADD r0,r0,r2
MOV r8,r0
ADD r8,r8,r2
ADD r8,r8,r2 // 6条指令创建3行指针上面和下面以及当前一个
ADD r1,r1,r2
ADD r1,r1,r2
ADD r1,r1,#2 //将目标指针移到第一个元素(1
SUB r2,r2,#2
SUB r3,r3,#2 //计数器递减,因为平滑函数在每边上的边距为1
LDR r9,= 0x1C71C71D //(1/9)* 2 ^ 32注入效果器分区参数9
LDR r10,= 0x2
LDR r11, = 0xC //将指针转换为数据
VLDR.U64 d20,= 0x1C71C71D //(1/9)* 2 ^ 32 pour effectuer la division par 9
VLDR.U64 d22,= 0x0 //初始化要使用的零(不是必需的)
VLDR.U64 d23,= 0x0
VDUP.32 d20,d20 [0] //初始化乘法向量
height_loop:
MOV r4,r2 //复位宽度计数器
CMP r4,#8
BLGE width_loop_eight_smoothing //使用氖而8行以上的元素需要平滑
CMP r4,#1
BLGE width_loop_rest //对于其余元素使用普通的ARM,不能在NEON中因为边距
ADD r0,r0,#4 //跳过边距
ADD r1,r1,#4
ADD r5,r5,#4
ADD r8,r8,#4
SUBS r3,r3,#1 //递减行计数器
BNE height_loop //循环时仍有行
LDMFD sp! ,{r4-r12,pc} //恢复堆栈并返回调用函数


width_loop_eight_smoothing:
SUB r4,r4,#8 //递减宽度计数器
VLD1.16 {d0,d1},[r5],r10 //加载左上角元素
VLD1.16 {d2,d3},[r5],r10 //加载中间中间元素
VADDL .S16 q2,d0,d2 //长时间添加元素以确保不会丢失任何数据
VADDL.S16 q3,d1,d3
VLD1.16 {d0,d1},[r5], r11 //加载右上元素
VLD1.16 {d2,d3},[r0],r10 //加载中左元素
VADDL.S16 q4,d0,d2
VADDL.S16 q5,d1,d3
VADD.S32 q2,q4 //加总计
VADD.S32 q3,q5
VLD1.16 {d0,d1},[r0],r10 / /加载当前元素
VLD1.16 {d2,d3},[r0],r11 //加载中间右元素
VADDL.S16 q4,d0,d2
VADDL.S16 q5,d1 ,d3
VADD.S32 q2,q4
VADD.S32 q3,q5
VLD1.16 {d0,d1},[r8],r10 //加载左下元素
VLD1.16 {d2,d3}, [r8],r10 //加载较低中间元素
VADDL.S16 q4,d0,d2
VADDL.S16 q5,d1,d3
VADD.S32 q2,q4
VADD .S32 q3,q5
VLD1.16 {d0,d1},[r8],r11 //加载右下角元素
VADDL.S16 q4,d0,d22
VADDL.S16 q5, d1,d23
VADD.S32 q2,q4
VADD.S32 q3,q5
VMULL.S32 q6,d4,d20 //除以9(上方元素除以9)
VMULL.S32 q7,d5,d20
VMULL.S32 q8,d6,d20
VMULL.S32 q9,d7,d20
VUZP.32 q6,q7 //将结果打包更少的寄存器和更小的元素
VUZP.32 q8,q9
VUZP.16 q7,q9
VSHR.U16 q8,q7,#15 //当乘积元素为负值时,结果总是为1低于
VADD.S16 q7,q8 //通过添加符号位来纠正总计
VST1.16 {d14,d15},[r1]! //存储结果
CMP r4,#8 //检查是否有足够的元素在NEON中执行8次
BCS width_loop_eight_smoothing //如果是,则循环霓虹灯代码
MOV PC,LR //返回ARM_smoothing如果不是



width_loop_rest://与NEON类似地工作,但每次只有一个元素
LDRSH r6,[r0],#2 //将已加载的半字转换为带符号的完整单词
LDRSH r7,[r0] //主要区别在于由于存在重叠,所以增量完成
ADD r6,r7,r6
LDRSH r7 ,[r0,#2]
ADD r6,r7,r6
LDRSH r7,[r5],#2
ADD r6,r7,r6
LDRSH r7,[r5]
ADD r6,r7,r6
LDRSH r7,[r5,#2]
ADD r6,r7,r6
LDRSH r7,[r8],#2
ADD r6,r7,r6
LDRSH r7, [r8]
ADD r6,r7,r6
LDRSH r7,[r8,#2]
ADD r6,r7,r6
SMULLS r6,r7,r6,r9
ADDMI r7,#1
STRH r7,[r1],#2
SUBS r4,#1 //递减宽度计数器并检查是否有剩余
BNE width_loop_rest
MOV PC,LR


解决方案

您可以清楚地看到编译器使用一些伪操作来注释汇编器...

...

  .global mean 
.type mean,%function

...

  .size mean,。-mean 

这些是放入COFF部分并需要进行编译,以便调用图工具可以知道 PC 范围适用于您的汇编程序函数。

  .global ARM_smoothing 
+ .type ARM_smoothing,%function

...

  + .size ARM_smoothing,。-ARM_smoothing 

其他伪操作取决于所需的调试信息。





其他 .fnend .fnstart .movsp .save .setfp 等。

这取决于调试/对象该工具预期的格式。还有两种类型的数据;
$ b


  1. 代码范围信息
  2. 堆栈和框架使用

  3. ol>

    这两种方法通常都需要用于展开(或回溯曲线),但采样性能工具可能只能用于第一个。



    相关: 20pointerrel =nofollow noreferrer> ARM链接和帧寄存器


    I am currently carrying out optimizations on a linux-based software itself on an ARM processor. Those optimizations are mostly in the form of ARM and ARM NEON functions.

    In order to profile the software I use perf record and flame-graphs, however, once I introduce the assembler functions, they do not stack on top of the functions that call them but rather seemingly random places.

    My question therefore was, what should I include in my functions for them to appear properly in the call stacks.

    There was a slightly related topic but no good answer was given How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?. I use the same flags plus mapcs-frame.

    Below, I give an example of a C function translated to ARM by GCC. This ARM function seems to produces decent stacks but I would like to understand why.

    int half(int in);
    int sum(int in1, int in2);
    int mean(int in1, int in2);
    
    int half(int i)
    {
        return i / 2;
    }
    
    int sum(int i, int j)
    {
        return i + j;
    }
    
    int mean(int i, int j)
    {
        int s = sum(i, j);
        int m = half(s);
        return m;
    }
    
    int main()
    {
        int a = 1;
        int b = 5;
        int i;
        int result;
        for (i = 0; i<10000000; i++) { 
            result = mean(a, b);
        }
        return 0;
    }
    

    .cpu cortex-a9
            .eabi_attribute 27, 3
            .eabi_attribute 28, 1
            .fpu neon
            .eabi_attribute 20, 1
            .eabi_attribute 21, 1
            .eabi_attribute 23, 3
            .eabi_attribute 24, 1
            .eabi_attribute 25, 1
            .eabi_attribute 26, 2
            .eabi_attribute 30, 6
            .eabi_attribute 34, 1
            .eabi_attribute 18, 4
            .file   "a.c"
            .text
            .align  2
            .global half
            .type   half, %function
        half:
            @ args = 0, pretend = 0, frame = 8
            @ frame_needed = 1, uses_anonymous_args = 0
            mov ip, sp
            stmfd   sp!, {fp, ip, lr, pc}
            sub fp, ip, #4
            sub sp, sp, #8
            str r0, [fp, #-16]
            ldr r3, [fp, #-16]
            mov r2, r3, lsr #31
            add r3, r2, r3
            mov r3, r3, asr #1
            mov r0, r3
            sub sp, fp, #12
            ldmfd   sp, {fp, sp, pc}
            .size   half, .-half
            .align  2
            .global sum
            .type   sum, %function
        sum:
            @ args = 0, pretend = 0, frame = 8
            @ frame_needed = 1, uses_anonymous_args = 0
            mov ip, sp
            stmfd   sp!, {fp, ip, lr, pc}
            sub fp, ip, #4
            sub sp, sp, #8
            str r0, [fp, #-16]
            str r1, [fp, #-20]
            ldr r2, [fp, #-16]
            ldr r3, [fp, #-20]
            add r3, r2, r3
            mov r0, r3
            sub sp, fp, #12
            ldmfd   sp, {fp, sp, pc}
            .size   sum, .-sum
            .align  2
            .global mean
            .type   mean, %function
        mean:
            @ args = 0, pretend = 0, frame = 16
            @ frame_needed = 1, uses_anonymous_args = 0
            mov ip, sp
            stmfd   sp!, {fp, ip, lr, pc}
            sub fp, ip, #4
            sub sp, sp, #16
            str r0, [fp, #-24]
            str r1, [fp, #-28]
            ldr r1, [fp, #-28]
            ldr r0, [fp, #-24]
            bl  sum
            str r0, [fp, #-16]
            ldr r0, [fp, #-16]
            bl  half
            str r0, [fp, #-20]
            ldr r3, [fp, #-20]
            mov r0, r3
            sub sp, fp, #12
            ldmfd   sp, {fp, sp, pc}
            .size   mean, .-mean
            .align  2
            .global main
            .type   main, %function
        main:
            @ args = 0, pretend = 0, frame = 16
            @ frame_needed = 1, uses_anonymous_args = 0
            mov ip, sp
            stmfd   sp!, {fp, ip, lr, pc}
            sub fp, ip, #4
            sub sp, sp, #16
            mov r3, #1
            str r3, [fp, #-20]
            mov r3, #5
            str r3, [fp, #-24]
            mov r3, #0
            str r3, [fp, #-16]
            b   .L8
        .L9:
            ldr r1, [fp, #-24]
            ldr r0, [fp, #-20]
            bl  mean
            str r0, [fp, #-28]
            ldr r3, [fp, #-16]
            add r3, r3, #1
            str r3, [fp, #-16]
        .L8:
            ldr r2, [fp, #-16]
            movw    r3, #38527
            movt    r3, 152
            cmp r2, r3
            ble .L9
            mov r3, #0
            mov r0, r3
            sub sp, fp, #12
            ldmfd   sp, {fp, sp, pc}
            .size   main, .-main
            .ident  "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)"
            .section    .note.GNU-stack,"",%progbits
    

    -------------------EDIT-------------------

    Here is the example of the kind of function I am trying to integrate. In terms of linkage, all it does is save the stack and link register at the beginning and set them a the end. What should I add to it?

    .section .text
    
    .global ARM_smoothing
    
    ARM_smoothing:
        STMFD       sp!, {r4-r12,lr} //move used registers on stack (avoid segmentation fault)
        MOV         r5, r0              
        ADD         r0, r0, r2
        ADD         r0, r0, r2
        MOV         r8, r0
        ADD         r8, r8, r2
        ADD         r8, r8, r2       //the 6 instructions create 3 pointers to the row above and below as well as the current one
        ADD         r1, r1, r2
        ADD         r1, r1, r2       
        ADD         r1, r1, #2       //move destination pointer to first element (1 row down, 1 element left)
        SUB         r2, r2, #2
        SUB         r3, r3, #2       //counters decremented because smoothing function works with a margin of 1 on every side
        LDR         r9, =0x1C71C71D  //(1/9)*2^32 pour effectuer la division par 9
        LDR         r10, =0x2
        LDR         r11, =0xC        //shifts for pointers to data  
        VLDR.U64    d20, =0x1C71C71D //(1/9)*2^32 pour effectuer la division par 9
        VLDR.U64    d22, =0x0        //initialization of zeros to be used (not ncessarily needed)
        VLDR.U64    d23, =0x0
        VDUP.32     d20, d20[0]      //initialize vector for multiplication
    height_loop:
        MOV         r4, r2           //reset width counter
        CMP         r4, #8
        BLGE        width_loop_eight_smoothing //use neon while more than 8 elements in row need smoothing
        CMP         r4, #1
        BLGE        width_loop_rest  //use normal ARM for remaining elements, can't do in NEON because of margin
        ADD         r0, r0, #4       //skip margin
        ADD         r1, r1, #4
        ADD         r5, r5, #4
        ADD         r8, r8, #4
        SUBS        r3, r3, #1       //decrement row counter
        BNE         height_loop      //loop while there still are rows
        LDMFD       sp!, {r4-r12,pc} //restore stack and return to calling function
    
    
    width_loop_eight_smoothing:
        SUB         r4, r4, #8       //decrement width counter
        VLD1.16     {d0, d1}, [r5], r10     //load upper left elements
        VLD1.16     {d2, d3}, [r5], r10     //load upper middle elements
        VADDL.S16   q2, d0, d2              //long addition of elements to be sure to not lose any data
        VADDL.S16   q3, d1, d3              
        VLD1.16     {d0, d1}, [r5], r11     //load upper right elements     
        VLD1.16     {d2, d3}, [r0], r10     //load middle left elements
        VADDL.S16   q4, d0, d2
        VADDL.S16   q5, d1, d3
        VADD.S32    q2, q4                  //add to grand total
        VADD.S32    q3, q5
        VLD1.16     {d0, d1}, [r0], r10     //load current elements
        VLD1.16     {d2, d3}, [r0], r11     //load middle right elements
        VADDL.S16   q4, d0, d2
        VADDL.S16   q5, d1, d3
        VADD.S32    q2, q4
        VADD.S32    q3, q5
        VLD1.16     {d0, d1}, [r8], r10     //load lower left elements
        VLD1.16     {d2, d3}, [r8], r10     //load lower middle elements
        VADDL.S16   q4, d0, d2
        VADDL.S16   q5, d1, d3
        VADD.S32    q2, q4
        VADD.S32    q3, q5
        VLD1.16     {d0, d1}, [r8], r11     //load lower right elements
        VADDL.S16   q4, d0, d22
        VADDL.S16   q5, d1, d23
        VADD.S32    q2, q4
        VADD.S32    q3, q5  
        VMULL.S32   q6, d4, d20             //divide by 9 (upper element is total divided by 9)
        VMULL.S32   q7, d5, d20
        VMULL.S32   q8, d6, d20
        VMULL.S32   q9, d7, d20
        VUZP.32     q6, q7                  //pack results into less registers and smaller elements
        VUZP.32     q8, q9
        VUZP.16     q7, q9
        VSHR.U16    q8, q7, #15             //when multiplied element is negative, result is always one under
        VADD.S16    q7, q8                  //rectifying by adding sign bit to total
        VST1.16     {d14, d15}, [r1]!       //store results
        CMP         r4, #8                  //check if theres enough elements to do 8 more in NEON          
        BCS         width_loop_eight_smoothing      //if yes, loop neon code
        MOV         PC, LR                  //return to ARM_smoothing if not
    
    
    
    width_loop_rest:                        //works similaarly to NEON but one element at a time
        LDRSH       r6, [r0], #2            //converts loaded half words to signed full words
        LDRSH       r7, [r0]                //main difference is with the way increments are done since there is an overlap
        ADD         r6, r7, r6
        LDRSH       r7, [r0, #2]
        ADD         r6, r7, r6
        LDRSH       r7, [r5], #2
        ADD         r6, r7, r6
        LDRSH       r7, [r5]
        ADD         r6, r7, r6
        LDRSH       r7, [r5, #2]
        ADD         r6, r7, r6
        LDRSH       r7, [r8], #2
        ADD         r6, r7, r6
        LDRSH       r7, [r8]
        ADD         r6, r7, r6
        LDRSH       r7, [r8, #2]
        ADD         r6, r7, r6  
        SMULLS      r6, r7, r6, r9
        ADDMI       r7, #1
        STRH        r7, [r1], #2
        SUBS        r4, #1          //decrement width counter and check if there's any left
        BNE         width_loop_rest
        MOV         PC, LR
    

    解决方案

    You can clearly see how the compiler is annotating the assembler with some pseudo-ops...

        .global mean
        .type   mean, %function
    

    ...

       .size   mean, .-mean   
    

    These are put in COFF sections and need to make it to a build so that the call graph tools can know what PC range is for your assembler function.

        .global ARM_smoothing
     +  .type ARM_smoothing, %function
    

    ...

     +  .size ARM_smoothing, .-ARM_smoothing
    

    Other pseudo-ops depend on the debug information needed.

    Others are .fnend, .fnstart, .movsp, .save, .setfp, etc.

    It depends on the debug/object format expected by the tool. There are also two types of data;

    1. code extent information
    2. stack and frame use

    Both are typically needed for unwinding (or a stack back trace) but a sampling performance tool might only get away with the first. Exception handling code that does object clean up requires the most information.

    Related: ARM Link and frame register

    这篇关于ARM头部以获得适当的调用栈的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆