RDTSC,太多的周期 [英] rdtsc, too many cycles

查看:160
本文介绍了RDTSC,太多的周期的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

#include <stdio.h>
static inline unsigned long long tick() 
{
        unsigned long long d;
        __asm__ __volatile__ ("rdtsc" : "=A" (d) );
        return d;
}

int main()
{
        long long res;
        res=tick();

        res=tick()-res;
        printf("%d",res);
        return 0;
}

我用 gcc 分别以 -O0、-O1、-O2、-O3 优化级别编译了这段代码,但得到的结果总是 2000-2500 个周期。谁能解释出现这种输出的原因?这些周期都花在哪里了?

第一个 tick 函数是错误的,下面这个才是正确的。

tick 函数的另一个版本:

static __inline__ unsigned long long tick()
{
  unsigned hi, lo;
  __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
  return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

这是使用 -O3 编译时生成的汇编代码:

 .file  "rdtsc.c"
.section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "%d"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
    leal    4(%esp), %ecx
    andl    $-16, %esp
    pushl   -4(%ecx)
    pushl   %ebp
    movl    %esp, %ebp
    subl    $40, %esp
    movl    %ecx, -16(%ebp)
    movl    %ebx, -12(%ebp)
    movl    %esi, -8(%ebp)
    movl    %edi, -4(%ebp)
#APP
# 6 "rdtsc.c" 1
    rdtsc
# 0 "" 2
#NO_APP
    movl    %edx, %edi
    movl    %eax, %esi
#APP
# 6 "rdtsc.c" 1
    rdtsc
# 0 "" 2
#NO_APP
    movl    %eax, %ecx
    movl    %edx, %ebx
    subl    %esi, %ecx
    sbbl    %edi, %ebx
    movl    %ecx, 4(%esp)
    movl    %ebx, 8(%esp)
    movl    $.LC0, (%esp)
    call    printf
    movl    -16(%ebp), %ecx
    xorl    %eax, %eax
    movl    -12(%ebp), %ebx
    movl    -8(%ebp), %esi
    movl    -4(%ebp), %edi
    movl    %ebp, %esp
    popl    %ebp
    leal    -4(%ecx), %esp
    ret
    .size   main, .-main
    .ident  "GCC: (Debian 4.3.2-1.1) 4.3.2"
    .section    .note.GNU-stack,"",@progbits

这是CPU

processor   : 0
vendor_id   : GenuineIntel
cpu family  : 15
model       : 4
model name  : Intel(R) Xeon(TM) CPU 3.00GHz
stepping    : 3
cpu MHz     : 3000.105
cache size  : 2048 KB
fdiv_bug    : no
hlt_bug     : no
f00f_bug    : no
coma_bug    : no
fpu     : yes
fpu_exception   : yes
cpuid level : 5
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss constant_tsc up pebs bts pni
bogomips    : 6036.62
clflush size    : 64


解决方案

我在多台运行不同 Linux 发行版、使用不同英特尔 CPU 的机器上试过你的代码(诚然,这些 CPU 都比你看起来正在使用的 Pentium 4 HT 630 要新)。在所有这些测试中,我得到的值都在 25 到 50 个周期之间。

我能想到的唯一与所有证据相符的假设是:你的操作系统运行在虚拟机中,而不是直接运行在裸机上,因此 TSC 被虚拟化了。

#include <stdio.h>
/*
 * Read the CPU time-stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers the counter in two 32-bit halves: the low half in EAX
 * and the high half in EDX.  Each half is bound to its own output
 * operand ("=a" / "=d") and the halves are joined in C.  The single
 * "=A" operand used previously only denotes the EDX:EAX pair when
 * compiling for 32-bit x86; on x86-64 it selects just one register, so
 * half of the counter was silently dropped.
 *
 * Returns the 64-bit counter value.
 */
static inline unsigned long long tick(void)
{
        unsigned int lo, hi;

        /* volatile: every call must execute its own RDTSC. */
        __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
        return ((unsigned long long)hi << 32) | lo;
}

/*
 * Take two back-to-back time-stamp counter readings and print how many
 * cycles elapsed between them.
 *
 * The difference is a 64-bit quantity, so it is printed with "%llu";
 * passing it to "%d" (which expects an int) is undefined behaviour.
 */
int main(void)
{
        unsigned long long start;
        unsigned long long elapsed;

        start = tick();

        /* Unsigned subtraction stays well defined even if the counter wraps. */
        elapsed = tick() - start;
        printf("%llu", elapsed);
        return 0;
}

I have compiled this code with gcc with -O0 -O1 -O2 -O3 optimizations. And I always get 2000-2500 cycles. Can anyone explain the reason for this output? How to spend these cycles?

First function "tick" is wrong. This is right.

Another version of function "tick"

/*
 * Return the current value of the time-stamp counter.
 *
 * RDTSC writes its result into EAX and EDX; the "=a" operand receives
 * the low 32 bits and the "=d" operand the high 32 bits, which are then
 * assembled into one 64-bit value.
 */
static __inline__ unsigned long long tick()
{
  unsigned from_eax, from_edx;
  unsigned long long stamp;

  __asm__ __volatile__ ("rdtsc" : "=a"(from_eax), "=d"(from_edx));

  /* Start from the high half, move it into place, then merge the low half. */
  stamp = from_edx;
  stamp <<= 32;
  stamp |= from_eax;
  return stamp;
}

This is assembly code for -O3

 .file  "rdtsc.c"
# Compiler-generated 32-bit x86 (AT&T syntax) listing for main() of rdtsc.c.
# Both calls to tick() have been inlined: the two rdtsc instructions below
# are separated only by two register-to-register moves.
.section    .rodata.str1.1,"aMS",@progbits,1
.LC0:
    .string "%d"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
    # Prologue: keep a pointer derived from the incoming %esp in %ecx,
    # round %esp down to a 16-byte boundary, and set up the %ebp frame.
    leal    4(%esp), %ecx
    andl    $-16, %esp
    pushl   -4(%ecx)
    pushl   %ebp
    movl    %esp, %ebp
    subl    $40, %esp
    # Save %ecx, %ebx, %esi and %edi in the frame; they are reloaded
    # from the same slots before returning.
    movl    %ecx, -16(%ebp)
    movl    %ebx, -12(%ebp)
    movl    %esi, -8(%ebp)
    movl    %edi, -4(%ebp)
    # First counter read; the %edx:%eax result is parked in %edi:%esi.
#APP
# 6 "rdtsc.c" 1
    rdtsc
# 0 "" 2
#NO_APP
    movl    %edx, %edi
    movl    %eax, %esi
    # Second counter read; result copied to %ebx:%ecx.
#APP
# 6 "rdtsc.c" 1
    rdtsc
# 0 "" 2
#NO_APP
    movl    %eax, %ecx
    movl    %edx, %ebx
    # 64-bit subtraction (second - first): subtract the low halves, then
    # the high halves with borrow.
    subl    %esi, %ecx
    sbbl    %edi, %ebx
    # printf arguments: format string address at (%esp), low half of the
    # difference at 4(%esp), high half at 8(%esp).  The format is "%d",
    # so both halves are passed while the format names a single int.
    movl    %ecx, 4(%esp)
    movl    %ebx, 8(%esp)
    movl    $.LC0, (%esp)
    call    printf
    # Epilogue: zero %eax (main's "return 0"), reload the saved
    # registers, and restore the original stack pointer via %ecx.
    movl    -16(%ebp), %ecx
    xorl    %eax, %eax
    movl    -12(%ebp), %ebx
    movl    -8(%ebp), %esi
    movl    -4(%ebp), %edi
    movl    %ebp, %esp
    popl    %ebp
    leal    -4(%ecx), %esp
    ret
    .size   main, .-main
    .ident  "GCC: (Debian 4.3.2-1.1) 4.3.2"
    .section    .note.GNU-stack,"",@progbits

This is CPU

processor   : 0
vendor_id   : GenuineIntel
cpu family  : 15
model       : 4
model name  : Intel(R) Xeon(TM) CPU 3.00GHz
stepping    : 3
cpu MHz     : 3000.105
cache size  : 2048 KB
fdiv_bug    : no
hlt_bug     : no
f00f_bug    : no
coma_bug    : no
fpu     : yes
fpu_exception   : yes
cpuid level : 5
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss constant_tsc up pebs bts pni
bogomips    : 6036.62
clflush size    : 64

解决方案

I've tried your code on several Linux distros running on different Intel CPUs (admittedly all more recent than the Pentium 4 HT 630 you appear to be using). In all those tests I got values between 25 and 50 cycles.

My only hypothesis that's consistent with all the evidence is that you're running your operating system inside a virtual machine rather than on bare metal, and TSC is getting virtualized.

这篇关于RDTSC,太多的周期的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆