ARM头部以获得适当的调用栈 [英] ARM Headers to Get Proper Call Stacks

查看：178 发布时间：2018/4/21 11:08:38 gcc arm perf

本文介绍了ARM头部以获得适当的调用栈的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我目前正在ARM处理器上对基于Linux的软件进行优化。这些优化主要以ARM和ARM NEON函数的形式提供。

为了对软件进行性能分析，我使用 perf record 和火焰图（flame graph）。然而，一旦引入汇编函数，它们就不会叠在调用它们的函数之上，而是出现在看似随机的位置。

因此，我的问题是：我应该在这些汇编函数中加入什么，才能让它们正确地出现在调用栈中？

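有一个略微相关的主题（How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?），但没有得到好的答案。我使用了与之相同的编译选项，并额外加上了 -mapcs-frame。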

下面我给出一个由GCC转换为ARM的C函数的例子。这个ARM函数似乎产生了不错的堆栈，但我想知道为什么。

  /* Forward declarations for the three helpers exercised by main(). */
  int half(int in);
  int sum(int in1, int in2);
  int mean(int in1, int in2);

  /* Return i divided by two; C integer division truncates toward zero. */
  int half(int i)
  {
      return i / 2;
  }
 
 /* Return the sum of the two operands. */
 int sum(int i, int j)
 {
     return i + j;
 }

 /* Return the integer mean of i and j, computed as half(sum(i, j)). */
 int mean(int i, int j)
 {
     int s = sum(i, j);   /* total of both operands */
     int m = half(s);     /* halve it, truncating toward zero */
     return m;
 }
 
 /*
  * Driver for the profiling experiment: calls mean(1, 5) ten million
  * times. The result is stored but never read; the loop exists only to
  * generate call activity.
  */
 int main()
 {
     int a = 1;
     int b = 5;
     int i;
     int result;
     for (i = 0; i < 10000000; i++) {
         result = mean(a, b);
     }
     return 0;
 }

  @ GCC output for the C source of half/sum/mean/main, GNU as syntax, ARM state.
      .cpu cortex-a9
      .eabi_attribute 27, 3
      .eabi_attribute 28, 1
      .fpu neon
      .eabi_attribute 20, 1
      .eabi_attribute 21, 1
      .eabi_attribute 23, 3
      .eabi_attribute 24, 1
      .eabi_attribute 25, 1
      .eabi_attribute 26, 2
      .eabi_attribute 30, 6
      .eabi_attribute 34, 1
      .eabi_attribute 18, 4
      .file   "a.c"
      .text
  @ half: In r0 = i.  Out r0 = i / 2, rounded toward zero.
      .align  2
      .global half
      .type   half, %function         @ mark the symbol as a function
  half:
      @ args = 0, pretend = 0, frame = 8
      @ frame_needed = 1, uses_anonymous_args = 0
      mov ip, sp                      @ ip = stack pointer on entry
      stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
      sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
      sub sp, sp, #8                  @ reserve 8 bytes of locals
      str r0, [fp, #-16]              @ spill i
      ldr r3, [fp, #-16]
      mov r2, r3, lsr #31             @ r2 = sign bit of i (1 when negative)
      add r3, r2, r3                  @ bias negative values so the shift truncates toward zero
      mov r3, r3, asr #1              @ r3 = (i + sign) >> 1
      mov r0, r3                      @ return value
      sub sp, fp, #12                 @ sp = address of the saved fp
      ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
      .size   half, .-half            @ symbol size = bytes from the label to here
 @ sum: In r0 = i, r1 = j.  Out r0 = i + j.
     .align  2
     .global sum
     .type   sum, %function          @ mark the symbol as a function
 sum:
     @ args = 0, pretend = 0, frame = 8
     @ frame_needed = 1, uses_anonymous_args = 0
     mov ip, sp                      @ ip = stack pointer on entry
     stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
     sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
     sub sp, sp, #8                  @ reserve 8 bytes of locals
     str r0, [fp, #-16]              @ spill i
     str r1, [fp, #-20]              @ spill j
     ldr r2, [fp, #-16]
     ldr r3, [fp, #-20]
     add r3, r2, r3                  @ r3 = i + j
     mov r0, r3                      @ return value
     sub sp, fp, #12                 @ sp = address of the saved fp
     ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
     .size   sum, .-sum              @ symbol size = bytes from the label to here
 @ mean: In r0 = i, r1 = j.  Out r0 = half(sum(i, j)).
 @ Stack slots: i at fp-24, j at fp-28, s at fp-16, m at fp-20.
     .align  2
     .global mean
     .type   mean, %function         @ mark the symbol as a function
 mean:
     @ args = 0, pretend = 0, frame = 16
     @ frame_needed = 1, uses_anonymous_args = 0
     mov ip, sp                      @ ip = stack pointer on entry
     stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
     sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
     sub sp, sp, #16                 @ reserve 16 bytes of locals
     str r0, [fp, #-24]              @ spill i
     str r1, [fp, #-28]              @ spill j
     ldr r1, [fp, #-28]
     ldr r0, [fp, #-24]
     bl  sum                         @ r0 = sum(i, j)
     str r0, [fp, #-16]              @ s = r0
     ldr r0, [fp, #-16]
     bl  half                        @ r0 = half(s)
     str r0, [fp, #-20]              @ m = r0
     ldr r3, [fp, #-20]
     mov r0, r3                      @ return m
     sub sp, fp, #12                 @ sp = address of the saved fp
     ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
     .size   mean, .-mean            @ symbol size = bytes from the label to here
 @ main: calls mean(1, 5) for i = 0 .. 9999999, then returns 0 in r0.
 @ Stack slots: a at fp-20, b at fp-24, i at fp-16, result at fp-28.
     .align  2
     .global main
     .type   main, %function         @ mark the symbol as a function
 main:
     @ args = 0, pretend = 0, frame = 16
     @ frame_needed = 1, uses_anonymous_args = 0
     mov ip, sp                      @ ip = stack pointer on entry
     stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
     sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
     sub sp, sp, #16                 @ reserve 16 bytes of locals
     mov r3, #1
     str r3, [fp, #-20]              @ a = 1
     mov r3, #5
     str r3, [fp, #-24]              @ b = 5
     mov r3, #0
     str r3, [fp, #-16]              @ i = 0
     b   .L8                         @ test the loop condition first
 .L9:
     ldr r1, [fp, #-24]              @ r1 = b
     ldr r0, [fp, #-20]              @ r0 = a
     bl  mean
     str r0, [fp, #-28]              @ result = mean(a, b)
     ldr r3, [fp, #-16]
     add r3, r3, #1
     str r3, [fp, #-16]              @ i++
 .L8:
     ldr r2, [fp, #-16]
     movw    r3, #38527
     movt    r3, 152                 @ r3 = (152 << 16) | 38527 = 9999999
     cmp r2, r3
     ble .L9                         @ keep looping while i <= 9999999
     mov r3, #0
     mov r0, r3                      @ return 0
     sub sp, fp, #12                 @ sp = address of the saved fp
     ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
     .size   main, .-main            @ symbol size = bytes from the label to here
     .ident  "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)"
     .section    .note.GNU-stack,"",%progbits

---------- ---------编辑-------------------

下面是我正在尝试集成的那类函数的示例。就链接约定而言，它所做的只是在开头保存栈和链接寄存器，并在结尾将它们恢复。我还应该添加什么？

  //-----------------------------------------------------------------------
  // ARM_smoothing -- 3x3 mean filter over signed 16-bit samples.
  // Each output is the sum of the nine samples centred on it, multiplied by
  // 0x1C71C71D (about 2^32/9) and reduced to the high word, i.e. sum / 9.
  // Rows are processed eight outputs at a time with NEON while at least
  // eight remain, then one at a time with core registers.
  //
  // ABI:   AAPCS, ARM (A32) state, NEON required
  // In:    r0 = first input row   (presumably the source image -- confirm with caller)
  //        r1 = output base       (presumably the destination image -- confirm)
  //        r2 = row length in 16-bit elements (row stride is 2*r2 bytes)
  //        r3 = number of rows
  // Out:   nothing in registers; results are stored through r1
  // Saved: r4-r12, lr, d8-d15 (restored before returning)
  // Clobb: r0-r3, d0-d7, d16-d20, d22, d23, flags
  //
  // width_loop_eight_smoothing and width_loop_rest are internal subroutines
  // reached with BL; they share this function's register state and return
  // with MOV PC, LR.  lr is already saved on the stack, so BL may overwrite it.
  //
  // NOTE(review): the row loop tests its counter after the body, so r3 must
  // be at least 3 on entry.
  // NOTE(review): no .type/.size directives are attached to ARM_smoothing here,
  // so tools that attribute samples by symbol extent have no size for it.
  //-----------------------------------------------------------------------
  .section .text

  .global ARM_smoothing

  ARM_smoothing:
      STMFD       sp!, {r4-r12,lr} //save the core registers used below and the return address
      VPUSH       {d8-d15}         //q4-q7 (d8-d15) are written below and are callee-saved under AAPCS
      MOV         r5, r0           //r5 = row above
      ADD         r0, r0, r2
      ADD         r0, r0, r2       //r0 = current row (r2 elements of 2 bytes further)
      MOV         r8, r0
      ADD         r8, r8, r2
      ADD         r8, r8, r2       //r8 = row below
      ADD         r1, r1, r2
      ADD         r1, r1, r2
      ADD         r1, r1, #2       //move destination pointer to first output (1 row down, 1 element in)
      SUB         r2, r2, #2
      SUB         r3, r3, #2       //counters decremented because smoothing works with a margin of 1 on every side
      LDR         r9, =0x1C71C71D  //(1/9)*2^32, used to perform the division by 9
      LDR         r10, =0x2
      LDR         r11, =0xC        //post-increment steps for the row pointers (2 + 2 + 12 = 16 bytes per pass)
      VLDR.U64    d20, =0x1C71C71D //(1/9)*2^32, used to perform the division by 9
      VLDR.U64    d22, =0x0        //zeros used to widen a single vector with VADDL
      VLDR.U64    d23, =0x0
      VDUP.32     d20, d20[0]      //broadcast the multiplier to both lanes
  height_loop:
      MOV         r4, r2           //reset width counter
      CMP         r4, #8
      BLGE        width_loop_eight_smoothing //use NEON while at least 8 elements in the row need smoothing
      CMP         r4, #1
      BLGE        width_loop_rest  //use core registers for the remaining elements
      ADD         r0, r0, #4       //skip margin (2 elements)
      ADD         r1, r1, #4
      ADD         r5, r5, #4
      ADD         r8, r8, #4
      SUBS        r3, r3, #1       //decrement row counter
      BNE         height_loop      //loop while there still are rows
      VPOP        {d8-d15}         //restore the callee-saved NEON registers
      LDMFD       sp!, {r4-r12,pc} //restore core registers and return to the caller


  width_loop_eight_smoothing:
      SUB         r4, r4, #8       //decrement width counter
      VLD1.16     {d0, d1}, [r5], r10     //load upper left elements
      VLD1.16     {d2, d3}, [r5], r10     //load upper middle elements
      VADDL.S16   q2, d0, d2              //widening add so no bits are lost
      VADDL.S16   q3, d1, d3
      VLD1.16     {d0, d1}, [r5], r11     //load upper right elements
      VLD1.16     {d2, d3}, [r0], r10     //load middle left elements
      VADDL.S16   q4, d0, d2
      VADDL.S16   q5, d1, d3
      VADD.S32    q2, q4                  //add to grand total
      VADD.S32    q3, q5
      VLD1.16     {d0, d1}, [r0], r10     //load current elements
      VLD1.16     {d2, d3}, [r0], r11     //load middle right elements
      VADDL.S16   q4, d0, d2
      VADDL.S16   q5, d1, d3
      VADD.S32    q2, q4
      VADD.S32    q3, q5
      VLD1.16     {d0, d1}, [r8], r10     //load lower left elements
      VLD1.16     {d2, d3}, [r8], r10     //load lower middle elements
      VADDL.S16   q4, d0, d2
      VADDL.S16   q5, d1, d3
      VADD.S32    q2, q4
      VADD.S32    q3, q5
      VLD1.16     {d0, d1}, [r8], r11     //load lower right elements
      VADDL.S16   q4, d0, d22             //widen the ninth vector by adding zero
      VADDL.S16   q5, d1, d23
      VADD.S32    q2, q4
      VADD.S32    q3, q5
      VMULL.S32   q6, d4, d20             //divide by 9 (upper word of each product is total / 9)
      VMULL.S32   q7, d5, d20
      VMULL.S32   q8, d6, d20
      VMULL.S32   q9, d7, d20
      VUZP.32     q6, q7                  //pack results into fewer registers and smaller elements
      VUZP.32     q8, q9
      VUZP.16     q7, q9
      VSHR.U16    q8, q7, #15             //when the multiplied element is negative, the result is always one under
      VADD.S16    q7, q8                  //rectify by adding the sign bit to the total
      VST1.16     {d14, d15}, [r1]!       //store results
      CMP         r4, #8                  //check whether enough elements remain for 8 more in NEON
      BCS         width_loop_eight_smoothing      //if yes, loop NEON code
      MOV         PC, LR                  //return to ARM_smoothing if not



  width_loop_rest:                        //works similarly to the NEON path but one element at a time
      LDRSH       r6, [r0], #2            //sign-extending halfword load; advances the row pointer by one element
      LDRSH       r7, [r0]                //neighbours are read relative to the advanced pointer
      ADD         r6, r7, r6
      LDRSH       r7, [r0, #2]
      ADD         r6, r7, r6
      LDRSH       r7, [r5], #2
      ADD         r6, r7, r6
      LDRSH       r7, [r5]
      ADD         r6, r7, r6
      LDRSH       r7, [r5, #2]
      ADD         r6, r7, r6
      LDRSH       r7, [r8], #2
      ADD         r6, r7, r6
      LDRSH       r7, [r8]
      ADD         r6, r7, r6
      LDRSH       r7, [r8, #2]
      ADD         r6, r7, r6
      SMULLS      r6, r7, r6, r9          //r7:r6 = total * (2^32/9); r7 = total / 9
      ADDMI       r7, #1                  //negative products come out one under; correct them
      STRH        r7, [r1], #2
      SUBS        r4, #1          //decrement width counter and check if there's any left
      BNE         width_loop_rest
      MOV         PC, LR

解决方案

您可以清楚地看到编译器使用一些伪操作来注释汇编器...

...

  .global mean
  .type mean, %function

...

  .size mean, .-mean

这些信息会被写入 COFF 节中，并且必须保留到最终的构建产物里，这样调用图工具才能知道您的汇编函数所对应的 PC 范围。

  .global ARM_smoothing
 + .type ARM_smoothing, %function

...

  + .size ARM_smoothing, .-ARM_smoothing

其他伪操作取决于所需的调试信息。

.func

.endfunc

.size

ARM CFI问题

.cantunwind

其他 .fnend ， .fnstart ， .movsp ， .save ， .setfp 等。

这取决于工具所期望的调试/目标文件格式。此外还有两类数据：

代码范围信息
堆栈和框架使用

展开（或栈回溯）通常两者都需要，但采样式性能工具也许只需要第一种就够了。执行对象清理的异常处理代码需要的信息最多。

相关：ARM 链接寄存器和帧寄存器（ARM Link and frame register）

I am currently carrying out optimizations on Linux-based software running on an ARM processor. Those optimizations are mostly in the form of ARM and ARM NEON functions.

In order to profile the software I use perf record and flame graphs. However, once I introduce the assembler functions, they do not stack on top of the functions that call them but rather appear in seemingly random places.

My question therefore was, what should I include in my functions for them to appear properly in the call stacks.

There was a slightly related topic, "How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?", but no good answer was given. I use the same flags plus -mapcs-frame.

Below, I give an example of a C function translated to ARM by GCC. This ARM function seems to produce decent stacks, but I would like to understand why.

int half(int in);
int sum(int in1, int in2);
int mean(int in1, int in2);

/* Return i divided by two; C integer division truncates toward zero. */
int half(int i)
{
    return i / 2;
}

/* Return the sum of the two operands. */
int sum(int i, int j)
{
    return i + j;
}

/* Return the integer mean of i and j, computed as half(sum(i, j)). */
int mean(int i, int j)
{
    int s = sum(i, j);   /* total of both operands */
    int m = half(s);     /* halve it, truncating toward zero */
    return m;
}

/*
 * Driver for the profiling experiment: calls mean(1, 5) ten million
 * times. The result is stored but never read; the loop exists only to
 * generate call activity.
 */
int main()
{
    int a = 1;
    int b = 5;
    int i;
    int result;
    for (i = 0; i<10000000; i++) { 
        result = mean(a, b);
    }
    return 0;
}

.cpu cortex-a9
        .eabi_attribute 27, 3
        .eabi_attribute 28, 1
        .fpu neon
        .eabi_attribute 20, 1
        .eabi_attribute 21, 1
        .eabi_attribute 23, 3
        .eabi_attribute 24, 1
        .eabi_attribute 25, 1
        .eabi_attribute 26, 2
        .eabi_attribute 30, 6
        .eabi_attribute 34, 1
        .eabi_attribute 18, 4
        .file   "a.c"
        .text
        @ half: GCC output for `int half(int i)`.
        @ In: r0 = i.  Out: r0 = i / 2, rounded toward zero.
        .align  2
        .global half
        .type   half, %function         @ mark the symbol as a function
    half:
        @ args = 0, pretend = 0, frame = 8
        @ frame_needed = 1, uses_anonymous_args = 0
        mov ip, sp                      @ ip = stack pointer on entry
        stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
        sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
        sub sp, sp, #8                  @ reserve 8 bytes of locals
        str r0, [fp, #-16]              @ spill i
        ldr r3, [fp, #-16]
        mov r2, r3, lsr #31             @ r2 = sign bit of i (1 when negative)
        add r3, r2, r3                  @ bias negative values so the shift truncates toward zero
        mov r3, r3, asr #1              @ r3 = (i + sign) >> 1
        mov r0, r3                      @ return value
        sub sp, fp, #12                 @ sp = address of the saved fp
        ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
        .size   half, .-half            @ symbol size = bytes from the label to here
        @ sum: GCC output for `int sum(int i, int j)`.
        @ In: r0 = i, r1 = j.  Out: r0 = i + j.
        .align  2
        .global sum
        .type   sum, %function          @ mark the symbol as a function
    sum:
        @ args = 0, pretend = 0, frame = 8
        @ frame_needed = 1, uses_anonymous_args = 0
        mov ip, sp                      @ ip = stack pointer on entry
        stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
        sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
        sub sp, sp, #8                  @ reserve 8 bytes of locals
        str r0, [fp, #-16]              @ spill i
        str r1, [fp, #-20]              @ spill j
        ldr r2, [fp, #-16]
        ldr r3, [fp, #-20]
        add r3, r2, r3                  @ r3 = i + j
        mov r0, r3                      @ return value
        sub sp, fp, #12                 @ sp = address of the saved fp
        ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
        .size   sum, .-sum              @ symbol size = bytes from the label to here
        @ mean: GCC output for `int mean(int i, int j)`.
        @ In: r0 = i, r1 = j.  Out: r0 = half(sum(i, j)).
        @ Stack slots: i at fp-24, j at fp-28, s at fp-16, m at fp-20.
        .align  2
        .global mean
        .type   mean, %function         @ mark the symbol as a function
    mean:
        @ args = 0, pretend = 0, frame = 16
        @ frame_needed = 1, uses_anonymous_args = 0
        mov ip, sp                      @ ip = stack pointer on entry
        stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
        sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
        sub sp, sp, #16                 @ reserve 16 bytes of locals
        str r0, [fp, #-24]              @ spill i
        str r1, [fp, #-28]              @ spill j
        ldr r1, [fp, #-28]
        ldr r0, [fp, #-24]
        bl  sum                         @ r0 = sum(i, j)
        str r0, [fp, #-16]              @ s = r0
        ldr r0, [fp, #-16]
        bl  half                        @ r0 = half(s)
        str r0, [fp, #-20]              @ m = r0
        ldr r3, [fp, #-20]
        mov r0, r3                      @ return m
        sub sp, fp, #12                 @ sp = address of the saved fp
        ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
        .size   mean, .-mean            @ symbol size = bytes from the label to here
        @ main: GCC output for `int main()`.
        @ Calls mean(1, 5) for i = 0 .. 9999999, then returns 0 in r0.
        @ Stack slots: a at fp-20, b at fp-24, i at fp-16, result at fp-28.
        .align  2
        .global main
        .type   main, %function         @ mark the symbol as a function
    main:
        @ args = 0, pretend = 0, frame = 16
        @ frame_needed = 1, uses_anonymous_args = 0
        mov ip, sp                      @ ip = stack pointer on entry
        stmfd   sp!, {fp, ip, lr, pc}   @ push frame record: caller fp, entry sp, return address, pc
        sub fp, ip, #4                  @ fp = entry sp - 4, the slot holding the pushed pc
        sub sp, sp, #16                 @ reserve 16 bytes of locals
        mov r3, #1
        str r3, [fp, #-20]              @ a = 1
        mov r3, #5
        str r3, [fp, #-24]              @ b = 5
        mov r3, #0
        str r3, [fp, #-16]              @ i = 0
        b   .L8                         @ test the loop condition first
    .L9:
        ldr r1, [fp, #-24]              @ r1 = b
        ldr r0, [fp, #-20]              @ r0 = a
        bl  mean
        str r0, [fp, #-28]              @ result = mean(a, b)
        ldr r3, [fp, #-16]
        add r3, r3, #1
        str r3, [fp, #-16]              @ i++
    .L8:
        ldr r2, [fp, #-16]
        movw    r3, #38527
        movt    r3, 152                 @ r3 = (152 << 16) | 38527 = 9999999
        cmp r2, r3
        ble .L9                         @ keep looping while i <= 9999999
        mov r3, #0
        mov r0, r3                      @ return 0
        sub sp, fp, #12                 @ sp = address of the saved fp
        ldmfd   sp, {fp, sp, pc}        @ restore fp and sp, load pc from the saved lr (return)
        .size   main, .-main            @ symbol size = bytes from the label to here
        .ident  "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)"
        .section    .note.GNU-stack,"",%progbits

-------------------EDIT-------------------

Here is an example of the kind of function I am trying to integrate. In terms of linkage, all it does is save the stack and link register at the beginning and set them at the end. What should I add to it?

//-----------------------------------------------------------------------
// ARM_smoothing -- 3x3 mean filter over signed 16-bit samples.
// Each output is the sum of the nine samples centred on it, multiplied by
// 0x1C71C71D (about 2^32/9) and reduced to the high word, i.e. sum / 9.
// Rows are processed eight outputs at a time with NEON while at least
// eight remain, then one at a time with core registers.
//
// ABI:   AAPCS, ARM (A32) state, NEON required
// In:    r0 = first input row   (presumably the source image -- confirm with caller)
//        r1 = output base       (presumably the destination image -- confirm)
//        r2 = row length in 16-bit elements (row stride is 2*r2 bytes)
//        r3 = number of rows
// Out:   nothing in registers; results are stored through r1
// Saved: r4-r12, lr, d8-d15 (restored before returning)
// Clobb: r0-r3, d0-d7, d16-d20, d22, d23, flags
//
// width_loop_eight_smoothing and width_loop_rest are internal subroutines
// reached with BL; they share this function's register state and return
// with MOV PC, LR.  lr is already saved on the stack, so BL may overwrite it.
//
// NOTE(review): the row loop tests its counter after the body, so r3 must
// be at least 3 on entry.
// NOTE(review): no .type/.size directives are attached to ARM_smoothing here,
// so tools that attribute samples by symbol extent have no size for it.
//-----------------------------------------------------------------------
.section .text

.global ARM_smoothing

ARM_smoothing:
    STMFD       sp!, {r4-r12,lr} //save the core registers used below and the return address
    VPUSH       {d8-d15}         //q4-q7 (d8-d15) are written below and are callee-saved under AAPCS
    MOV         r5, r0           //r5 = row above
    ADD         r0, r0, r2
    ADD         r0, r0, r2       //r0 = current row (r2 elements of 2 bytes further)
    MOV         r8, r0
    ADD         r8, r8, r2
    ADD         r8, r8, r2       //r8 = row below
    ADD         r1, r1, r2
    ADD         r1, r1, r2
    ADD         r1, r1, #2       //move destination pointer to first output (1 row down, 1 element in)
    SUB         r2, r2, #2
    SUB         r3, r3, #2       //counters decremented because smoothing works with a margin of 1 on every side
    LDR         r9, =0x1C71C71D  //(1/9)*2^32, used to perform the division by 9
    LDR         r10, =0x2
    LDR         r11, =0xC        //post-increment steps for the row pointers (2 + 2 + 12 = 16 bytes per pass)
    VLDR.U64    d20, =0x1C71C71D //(1/9)*2^32, used to perform the division by 9
    VLDR.U64    d22, =0x0        //zeros used to widen a single vector with VADDL
    VLDR.U64    d23, =0x0
    VDUP.32     d20, d20[0]      //broadcast the multiplier to both lanes
height_loop:
    MOV         r4, r2           //reset width counter
    CMP         r4, #8
    BLGE        width_loop_eight_smoothing //use NEON while at least 8 elements in the row need smoothing
    CMP         r4, #1
    BLGE        width_loop_rest  //use core registers for the remaining elements
    ADD         r0, r0, #4       //skip margin (2 elements)
    ADD         r1, r1, #4
    ADD         r5, r5, #4
    ADD         r8, r8, #4
    SUBS        r3, r3, #1       //decrement row counter
    BNE         height_loop      //loop while there still are rows
    VPOP        {d8-d15}         //restore the callee-saved NEON registers
    LDMFD       sp!, {r4-r12,pc} //restore core registers and return to the caller


width_loop_eight_smoothing:
    SUB         r4, r4, #8       //decrement width counter
    VLD1.16     {d0, d1}, [r5], r10     //load upper left elements
    VLD1.16     {d2, d3}, [r5], r10     //load upper middle elements
    VADDL.S16   q2, d0, d2              //widening add so no bits are lost
    VADDL.S16   q3, d1, d3
    VLD1.16     {d0, d1}, [r5], r11     //load upper right elements
    VLD1.16     {d2, d3}, [r0], r10     //load middle left elements
    VADDL.S16   q4, d0, d2
    VADDL.S16   q5, d1, d3
    VADD.S32    q2, q4                  //add to grand total
    VADD.S32    q3, q5
    VLD1.16     {d0, d1}, [r0], r10     //load current elements
    VLD1.16     {d2, d3}, [r0], r11     //load middle right elements
    VADDL.S16   q4, d0, d2
    VADDL.S16   q5, d1, d3
    VADD.S32    q2, q4
    VADD.S32    q3, q5
    VLD1.16     {d0, d1}, [r8], r10     //load lower left elements
    VLD1.16     {d2, d3}, [r8], r10     //load lower middle elements
    VADDL.S16   q4, d0, d2
    VADDL.S16   q5, d1, d3
    VADD.S32    q2, q4
    VADD.S32    q3, q5
    VLD1.16     {d0, d1}, [r8], r11     //load lower right elements
    VADDL.S16   q4, d0, d22             //widen the ninth vector by adding zero
    VADDL.S16   q5, d1, d23
    VADD.S32    q2, q4
    VADD.S32    q3, q5
    VMULL.S32   q6, d4, d20             //divide by 9 (upper word of each product is total / 9)
    VMULL.S32   q7, d5, d20
    VMULL.S32   q8, d6, d20
    VMULL.S32   q9, d7, d20
    VUZP.32     q6, q7                  //pack results into fewer registers and smaller elements
    VUZP.32     q8, q9
    VUZP.16     q7, q9
    VSHR.U16    q8, q7, #15             //when the multiplied element is negative, the result is always one under
    VADD.S16    q7, q8                  //rectify by adding the sign bit to the total
    VST1.16     {d14, d15}, [r1]!       //store results
    CMP         r4, #8                  //check whether enough elements remain for 8 more in NEON
    BCS         width_loop_eight_smoothing      //if yes, loop NEON code
    MOV         PC, LR                  //return to ARM_smoothing if not



width_loop_rest:                        //works similarly to the NEON path but one element at a time
    LDRSH       r6, [r0], #2            //sign-extending halfword load; advances the row pointer by one element
    LDRSH       r7, [r0]                //neighbours are read relative to the advanced pointer
    ADD         r6, r7, r6
    LDRSH       r7, [r0, #2]
    ADD         r6, r7, r6
    LDRSH       r7, [r5], #2
    ADD         r6, r7, r6
    LDRSH       r7, [r5]
    ADD         r6, r7, r6
    LDRSH       r7, [r5, #2]
    ADD         r6, r7, r6
    LDRSH       r7, [r8], #2
    ADD         r6, r7, r6
    LDRSH       r7, [r8]
    ADD         r6, r7, r6
    LDRSH       r7, [r8, #2]
    ADD         r6, r7, r6
    SMULLS      r6, r7, r6, r9          //r7:r6 = total * (2^32/9); r7 = total / 9
    ADDMI       r7, #1                  //negative products come out one under; correct them
    STRH        r7, [r1], #2
    SUBS        r4, #1          //decrement width counter and check if there's any left
    BNE         width_loop_rest
    MOV         PC, LR

解决方案

You can clearly see how the compiler is annotating the assembler with some pseudo-ops...

    .global mean
    .type   mean, %function

...

   .size   mean, .-mean

These are put in COFF sections and need to make it to a build so that the call graph tools can know what PC range is for your assembler function.

    .global ARM_smoothing
 +  .type ARM_smoothing, %function

...

 +  .size ARM_smoothing, .-ARM_smoothing

Other pseudo-ops depend on the debug information needed.

Others are .fnend, .fnstart, .movsp, .save, .setfp, etc.

It depends on the debug/object format expected by the tool. There are also two types of data;

code extent information
stack and frame use

Both are typically needed for unwinding (or a stack back trace) but a sampling performance tool might only get away with the first. Exception handling code that does object clean up requires the most information.

Related: ARM Link and frame register

这篇关于ARM头部以获得适当的调用栈的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

ARM头部以获得适当的调用栈 [英] ARM Headers to Get Proper Call Stacks

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录关闭

ARM头部以获得适当的调用栈 [英] ARM Headers to Get Proper Call Stacks

问题描述

相关文章

其他开发最新文章

热门教程

热门工具

登录 关闭

登录关闭