如何在x86_64上准确基准未对齐的访问速度 [英] How can I accurately benchmark unaligned access speed on x86_64

查看:96
本文介绍了如何在x86_64上准确基准未对齐的访问速度的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

答案中,我曾指出,长期以来,未对齐访问的速度几乎与对齐访问的速度相同(在x86/x86_64上).我没有任何数字可以支持此声明,因此我为此创建了一个基准.

您是否看到此基准测试中存在任何缺陷?您可以改进它吗(我的意思是,提高GB/秒,以便更好地反映事实)?

 #include <sys/time.h>
#include <stdio.h>

// Throughput kernel for 4-byte loads: walks N bytes starting at v.
//
// Each trip of the loop issues forty `mov disp(%0), %eax` loads covering
// offsets 0x00..0x9c (40 * 4 = 160 bytes) and then advances v by 160, so the
// loop makes ceil(N / 160) trips.  Every load overwrites eax and nothing reads
// it, so the loads are independent of each other: this exercises load
// throughput, not load-use latency.
//
// Whether the loads are aligned is decided purely by the caller's pointer
// (main passes `data` for the aligned run and `data+1` for the unaligned one).
//
// NOTE(review): the asm statements declare only the pointer input and the eax
// clobber; the memory they read is not described to the compiler (no "m"
// operand / "memory" clobber).  They have no outputs, which GCC documents as
// implicitly volatile -- confirm for other compilers.
template <int N>
__attribute__((noinline))
void loop32(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("mov     (%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x04(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x08(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x0c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x10(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x14(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x18(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x1c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x20(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x24(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x28(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x2c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x30(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x34(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x38(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x3c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x40(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x44(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x48(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x4c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x50(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x54(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x58(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x5c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x60(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x64(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x68(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x6c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x70(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x74(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x78(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x7c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x80(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x84(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x88(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x8c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x90(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x94(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x98(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x9c(%0), %%eax" : : "r"(v) :"eax");
        v += 160;
    }
}

// Throughput kernel for 8-byte loads: walks N bytes starting at v.
//
// Each trip issues twenty `mov disp(%0), %rax` loads at offsets 0x00..0x98
// (20 * 8 = 160 bytes) and then advances v by 160.  All loads write rax and
// the value is never consumed, so the loads are mutually independent.
//
// Alignment is determined by the pointer the caller passes in.
//
// NOTE(review): as written, the asm only declares the pointer input and the
// rax clobber; the memory being read is not described to the compiler.
template <int N>
__attribute__((noinline))
void loop64(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("mov     (%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x08(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x10(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x18(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x20(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x28(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x30(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x38(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x40(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x48(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x50(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x58(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x60(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x68(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x70(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x78(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x80(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x88(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x90(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x98(%0), %%rax" : : "r"(v) :"rax");
        v += 160;
    }
}

// Throughput kernel for 16-byte loads using MOVAPS (the alignment-requiring
// SSE load): walks N bytes starting at v.
//
// Each trip issues ten `movaps disp(%0), %xmm0` loads at offsets 0x00..0x90
// (10 * 16 = 160 bytes) and then advances v by 160.  All loads overwrite xmm0
// and the value is never used, so the loads are independent.
//
// Because every displacement and the 160-byte stride are multiples of 16, all
// accesses are 16-byte aligned exactly when v is.  MOVAPS faults on a
// misaligned memory operand, so v must be 16-byte aligned; main only ever
// calls this with the aligned `data` pointer.
template <int N>
__attribute__((noinline))
void loop128a(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("movaps     (%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
        v += 160;
    }
}

// Throughput kernel for 16-byte loads using MOVUPS (the SSE load that accepts
// any address): walks N bytes starting at v.
//
// Same shape as loop128a -- ten 16-byte loads into xmm0 at offsets 0x00..0x90
// per trip, then v += 160 -- but with movups instead of movaps, so v does not
// have to be 16-byte aligned.
//
// main only calls this with `data+1`; movups on an aligned pointer is never
// timed by this program.
template <int N>
__attribute__((noinline))
void loop128u(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("movups     (%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
        v += 160;
    }
}

// Returns the current wall-clock time reported by gettimeofday(), expressed
// as a single count of microseconds (seconds * 1e6 + microsecond remainder).
long long int t() {
    timeval now;
    gettimeofday(&now, 0);
    const long long int whole_seconds_us = static_cast<long long int>(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}

// Benchmark driver.  For each access width (32-bit, 64-bit, 128-bit) it times
// four runs and prints two result lines:
//   "cache": a 16000-byte window (N/100000) re-read ITER*100000 times,
//            first from `data` (aligned), then from `data+1` (unaligned);
//   "mem":   the whole N-byte buffer read ITER times, aligned then unaligned.
// Both variants touch N*ITER bytes in total, so the same formula converts
// every interval to GB/sec.
int main() {
    const int ITER = 10;
    const int N = 1600000000;

    // Over-allocate by 32 bytes and round the pointer up to the next 16-byte
    // boundary; the slack covers the rounding (at most 15 bytes) plus the +1
    // offset used by the unaligned runs.  The original pointer is discarded,
    // so the allocation is never freed.
    char *data = reinterpret_cast<char *>(((reinterpret_cast<unsigned long long>(new char[N+32])+15)&~15));
    // Write every byte the loops may read -- presumably to fault the pages in
    // before timing starts (TODO confirm intent).
    for (int i=0; i<N+16; i++) data[i] = 0;

    // ---- 32-bit loads ----
    {
        // t0..t4 bracket the four runs: cache/aligned, cache/unaligned,
        // mem/aligned, mem/unaligned.
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data+1);
        }
        long long int t4 = t();

        // bytes / microseconds / 1000 == GB/sec.  "difference" is the
        // unaligned time relative to the aligned time, in percent.
        printf(" 32-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 32-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    // ---- 64-bit loads (same structure as above) ----
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data+1);
        }
        long long int t4 = t();

        printf(" 64-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 64-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    // ---- 128-bit loads ----
    // The aligned runs use movaps (loop128a) and the unaligned runs use
    // movups (loop128u), so this section compares two different instructions
    // as well as two alignments; movups on aligned data is not measured.
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128a<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128u<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop128a<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop128u<N>(data+1);
        }
        long long int t4 = t();

        printf("128-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf("128-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
}
 

解决方案

计时方法.我可能会进行设置,以便由命令行arg选择测试,因此我可以使用perf stat ./unaligned-test对其计时,并获得perf计数器结果,而不仅仅是每次测试的挂钟时间.这样,由于我可以测量核心时钟周期,所以我不必在意涡轮增压/省电. (除非您禁用Turbo和其他频率变化,否则与gettimeofday/rdtsc参考周期不一样.)


您只测试吞吐量,而不是等待时间,因为所有负载都不依赖.

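在缓存测试中,未对齐带来的损失会比内存测试中更明显,但您可能没有意识到,这可能是因为瓶颈出在处理跨高速缓存行边界的加载/存储的拆分加载寄存器(split-load register)数量上.对于顺序读取,外层缓存看到的始终只是对整条高速缓存行的一系列请求;只有从L1D获取数据的执行单元才需要关心对齐.要测试未缓存情况下的未对齐访问,可以进行分散加载,这样高速缓存行拆分就需要把两条高速缓存行都带入L1.

高速缓存行宽64B(见脚注1),因此您测试的始终是高速缓存行拆分与行内访问的混合.如果测试的全部是拆分加载,会在拆分加载的微体系结构资源上形成更严重的瓶颈. (实际上,取决于您的CPU,缓存提取宽度可能比行大小更窄.最近的Intel CPU可以从高速缓存行内提取任意未对齐的块,但这是因为它们有专门的硬件来加速.其他CPU可能只有在自然对齐的16B块内提取时才最快. @BeeOnRope 表示AMD CPU可能会关注16B和32B边界.)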

您根本没有测试存储->负载转发.对于现有测试,以及一种以可视化的方式显示不同比对结果的好方法,请参阅此stuffedcow.net博客文章: x86处理器中的存储到负载转发和内存歧义消除.

通过内存传递数据是一个重要的用例,未对齐+高速缓存行拆分可能会干扰某些CPU上的存储转发.要正确测试,请确保测试不同的错位,而不仅仅是1:15(向量)或1:3(整数). (您目前仅测试相对于16B对齐的+1偏移量.)

我忘了这是只针对存储转发还是也适用于常规加载,但是当加载在高速缓存行边界上被平均拆分时(8:8向量拆分,也可能是4:4或2:2整数拆分),惩罚可能会更小.您应该对此进行测试. (我想到的可能是P4的lddqu或Core 2的movdqu.)

去看看 Agner Fog 的资料,以了解更多关于未对齐加载为何会变慢的信息,并编写测试来覆盖这些情况.实际上,Agner可能不是这方面最好的资源,因为他的微体系结构指南主要侧重于让uop通过流水线.其中只是简要提到了高速缓存行拆分的成本,而没有深入讨论吞吐量与延迟的区别.

另请参阅:缓存行拆分,取Dark Shikari的博客(x264首席开发人员)的两篇谈到了Core2上未对齐的加载策略:值得检查对齐并为该块使用其他策略.


脚注:

  1. 如今,假设高速缓存行为64B是安全的.奔腾3和更早版本具有32B行. P4有64B行,但通常以128B对齐的对的形式传输.我以为我记得读过P4在L2或L3中实际上有128B行,但这也许只是对"成对传输的64B行"的误记. 7-CPU明确指出P4 130nm的两级缓存都是64B行.


另请参见 Skylake 的 uarch-bench 结果.

  • [base + 0..2047]:4c加载使用延迟,11c高速缓存行拆分,11c 4k页面拆分(即使在同一大页面内).有关更多详细信息,请参见"如果base+offset位于与base不同的页面中,是否会有惩罚?":如果base+disp最终与base位于不同页面中,则必须重播负载uop.
  • 其他任何寻址模式:5c延迟,11c高速缓存行拆分,12c 4k拆分(即使在大页面内).这包括[rax - 16].造成差异的不是disp8与disp32.

因此:大页面无助于避免页面拆分的罚款(至少当两个页面在TLB中都很热时).高速缓存行拆分使寻址模式无关紧要,但是快速"寻址模式对普通加载和页面拆分加载的延迟降低了1c.

4k分割处理比以前要好得多,请参阅@harold的数字,其中Haswell的4k分割延迟约为32c. (而且更老的CPU可能比这更糟.我认为在SKL之前,这应该会造成约100个周期的损失.)

吞吐量(与寻址模式无关),它是通过使用rax以外的目标位置进行测量的,因此负载是独立的:

  • 不分割:0.5c.
  • CL分割:1c.
  • 4k分割:〜3.8至3.9c(比Skylake之前的CPU好很多)

movzx/movsx具有相同的吞吐量/延迟(包括WORD拆分),这是按预期的,因为它们是在加载端口中处理的(与某些AMD CPU不同,其中也有ALU uop).

从RS(预留站)重放缓存行拆分的负载. uops_dispatched_port.port_2 + port_3的计数器= mov rdi, [rdi]的2倍,在另一个使用基本相同循环的测试中. (这是一个依赖负载的情况,不受吞吐量限制.)直到AGU之后,您才能检测到拆分负载.

大概是当一个加载uop发现它需要第二行的数据时,它会寻找一个拆分寄存器(Intel CPU用于处理拆分负载的缓冲区),并把第一行中所需的那部分数据放入该拆分寄存器.同时还向RS发出信号,表明该uop需要重播. (这是猜测.)

我认为,即使在拆分中不存在任何缓存行,拆分加载重播也应在几个周期内发生(也许只要加载端口向RS报告它是拆分的,即在地址-之后一代).因此,拆分两侧的需求负载请求都可以立即发送.


另请参见奇怪的效果在IvyBridge上的指针追逐循环中从附近的从属存储中获取.添加额外的负载可以加快速度吗?有关uop重放的更多信息. (但是请注意,这是针对依赖于的负载,而不是负载uop本身.在该Q& A中,依赖的uops大多也是负载.)

缓存未命中的加载本身不需要重播就能在数据就绪时"接受"传入的数据,需要重播的只是依赖它的uop.请参阅关于"加载操作是在分派时、完成时还是其他时间从RS中释放?"的聊天讨论.这个 https://godbolt.org/z/HJF3BN 上的NASM测试用例(在i7-6700k上)显示,无论是L1d命中还是L3命中,分派的加载uop数量都相同.但是,分派的ALU uop数量(不计循环开销)从每次加载1个增加到每次加载约8.75个.调度程序会激进地把消耗该数据的uop安排在加载数据可能从L2高速缓存到达的那个周期分派(此后似乎更加激进),而不是多等一个周期来确认数据是否真的到达.

当在同一端口上肯定已经准备好进行其他独立但较年轻的工作时,我们还没有测试过积极的重放.


SKL具有两个硬件分页浏览单元,这可能与4k分割性能的大幅提高有关.即使没有TLB遗漏,大概是较旧的CPU也必须考虑到可能存在的事实.

有趣的是4k拆分吞吐量不是整数.我认为我的测量结果具有足够的精度和可重复性.请记住,这是每个每次加载进行4k拆分的过程,并且没有其他工作在进行(除了在小型dec/jnz循环中).如果您在真实代码中使用过此功能,则说明您确实在做错事.

对于为什么它不是整数,我没有任何可靠的猜测,但是很显然,对于4k分割,微体系结构必须进行很多工作.仍然是缓存行拆分,并且必须两次检查TLB.

In an answer, I've stated that unaligned access has had almost the same speed as aligned access for a long time (on x86/x86_64). I didn't have any numbers to back up this statement, so I've created a benchmark for it.

Do you see any flaws in this benchmark? Can you improve on it (I mean, to increase GB/sec, so it reflects the truth better)?

#include <sys/time.h>
#include <stdio.h>

// Throughput kernel for 4-byte loads: walks N bytes starting at v.
//
// Each trip of the loop issues forty `mov disp(%0), %eax` loads covering
// offsets 0x00..0x9c (40 * 4 = 160 bytes) and then advances v by 160, so the
// loop makes ceil(N / 160) trips.  Every load overwrites eax and nothing reads
// it, so the loads are independent of each other: this exercises load
// throughput, not load-use latency.
//
// Whether the loads are aligned is decided purely by the caller's pointer
// (main passes `data` for the aligned run and `data+1` for the unaligned one).
//
// NOTE(review): the asm statements declare only the pointer input and the eax
// clobber; the memory they read is not described to the compiler (no "m"
// operand / "memory" clobber).  They have no outputs, which GCC documents as
// implicitly volatile -- confirm for other compilers.
template <int N>
__attribute__((noinline))
void loop32(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("mov     (%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x04(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x08(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x0c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x10(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x14(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x18(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x1c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x20(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x24(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x28(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x2c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x30(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x34(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x38(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x3c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x40(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x44(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x48(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x4c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x50(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x54(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x58(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x5c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x60(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x64(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x68(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x6c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x70(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x74(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x78(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x7c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x80(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x84(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x88(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x8c(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x90(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x94(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x98(%0), %%eax" : : "r"(v) :"eax");
        __asm__ ("mov 0x9c(%0), %%eax" : : "r"(v) :"eax");
        v += 160;
    }
}

// Throughput kernel for 8-byte loads: walks N bytes starting at v.
//
// Each trip issues twenty `mov disp(%0), %rax` loads at offsets 0x00..0x98
// (20 * 8 = 160 bytes) and then advances v by 160.  All loads write rax and
// the value is never consumed, so the loads are mutually independent.
//
// Alignment is determined by the pointer the caller passes in.
//
// NOTE(review): as written, the asm only declares the pointer input and the
// rax clobber; the memory being read is not described to the compiler.
template <int N>
__attribute__((noinline))
void loop64(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("mov     (%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x08(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x10(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x18(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x20(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x28(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x30(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x38(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x40(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x48(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x50(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x58(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x60(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x68(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x70(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x78(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x80(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x88(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x90(%0), %%rax" : : "r"(v) :"rax");
        __asm__ ("mov 0x98(%0), %%rax" : : "r"(v) :"rax");
        v += 160;
    }
}

// Throughput kernel for 16-byte loads using MOVAPS (the alignment-requiring
// SSE load): walks N bytes starting at v.
//
// Each trip issues ten `movaps disp(%0), %xmm0` loads at offsets 0x00..0x90
// (10 * 16 = 160 bytes) and then advances v by 160.  All loads overwrite xmm0
// and the value is never used, so the loads are independent.
//
// Because every displacement and the 160-byte stride are multiples of 16, all
// accesses are 16-byte aligned exactly when v is.  MOVAPS faults on a
// misaligned memory operand, so v must be 16-byte aligned; main only ever
// calls this with the aligned `data` pointer.
template <int N>
__attribute__((noinline))
void loop128a(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("movaps     (%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movaps 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
        v += 160;
    }
}

// Throughput kernel for 16-byte loads using MOVUPS (the SSE load that accepts
// any address): walks N bytes starting at v.
//
// Same shape as loop128a -- ten 16-byte loads into xmm0 at offsets 0x00..0x90
// per trip, then v += 160 -- but with movups instead of movaps, so v does not
// have to be 16-byte aligned.
//
// main only calls this with `data+1`; movups on an aligned pointer is never
// timed by this program.
template <int N>
__attribute__((noinline))
void loop128u(const char *v) {
    for (int i=0; i<N; i+=160) {
        __asm__ ("movups     (%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
        __asm__ ("movups 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
        v += 160;
    }
}

// Returns the current wall-clock time reported by gettimeofday(), expressed
// as a single count of microseconds (seconds * 1e6 + microsecond remainder).
long long int t() {
    timeval now;
    gettimeofday(&now, 0);
    const long long int whole_seconds_us = static_cast<long long int>(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}

// Benchmark driver.  For each access width (32-bit, 64-bit, 128-bit) it times
// four runs and prints two result lines:
//   "cache": a 16000-byte window (N/100000) re-read ITER*100000 times,
//            first from `data` (aligned), then from `data+1` (unaligned);
//   "mem":   the whole N-byte buffer read ITER times, aligned then unaligned.
// Both variants touch N*ITER bytes in total, so the same formula converts
// every interval to GB/sec.
int main() {
    const int ITER = 10;
    const int N = 1600000000;

    // Over-allocate by 32 bytes and round the pointer up to the next 16-byte
    // boundary; the slack covers the rounding (at most 15 bytes) plus the +1
    // offset used by the unaligned runs.  The original pointer is discarded,
    // so the allocation is never freed.
    char *data = reinterpret_cast<char *>(((reinterpret_cast<unsigned long long>(new char[N+32])+15)&~15));
    // Write every byte the loops may read -- presumably to fault the pages in
    // before timing starts (TODO confirm intent).
    for (int i=0; i<N+16; i++) data[i] = 0;

    // ---- 32-bit loads ----
    {
        // t0..t4 bracket the four runs: cache/aligned, cache/unaligned,
        // mem/aligned, mem/unaligned.
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data+1);
        }
        long long int t4 = t();

        // bytes / microseconds / 1000 == GB/sec.  "difference" is the
        // unaligned time relative to the aligned time, in percent.
        printf(" 32-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 32-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    // ---- 64-bit loads (same structure as above) ----
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data+1);
        }
        long long int t4 = t();

        printf(" 64-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 64-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    // ---- 128-bit loads ----
    // The aligned runs use movaps (loop128a) and the unaligned runs use
    // movups (loop128u), so this section compares two different instructions
    // as well as two alignments; movups on aligned data is not measured.
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128a<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128u<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop128a<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop128u<N>(data+1);
        }
        long long int t4 = t();

        printf("128-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf("128-bit,   mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
}

解决方案

Timing method. I probably would have set it up so the test was selected by a command-line arg, so I could time it with perf stat ./unaligned-test, and get perf counter results instead of just wall-clock times for each test. That way, I wouldn't have to care about turbo / power-saving, since I could measure in core clock cycles. (Not the same thing as gettimeofday / rdtsc reference cycles unless you disable turbo and other frequency-variation.)


You're only testing throughput, not latency, because none of the loads are dependent.

The misalignment penalty in your cache numbers will be worse than in your memory numbers, but you may not realize that this could be due to bottlenecking on the number of split-load registers that handle loads/stores that cross a cache-line boundary. For sequential-read, the outer levels of cache are still always just going to see a sequence of requests for whole cache lines. It's only the execution units getting data from L1D that have to care about alignment. To test misalignment for the non-cached case, you could do scattered loads, so cache-line splits would need to bring two cache lines into L1.

Cache lines are 64B wide (see footnote 1), so you're always testing a mix of cache-line splits and within-a-cache-line accesses. Testing always-split loads would bottleneck harder on the split-load microarchitectural resources. (Actually, depending on your CPU, the cache-fetch width might be narrower than the line size. Recent Intel CPUs can fetch any unaligned chunk from inside a cache line, but that's because they have special hardware to make that fast. Other CPUs may only be at their fastest when fetching within a naturally-aligned 16B chunk or something. @BeeOnRope says that AMD CPUs may care about 16B and 32B boundaries.)

You're not testing store->load forwarding at all. For existing tests, and a nice way to visualize results for different alignments, see this stuffedcow.net blog post: Store-to-Load Forwarding and Memory Disambiguation in x86 Processors.

Passing data through memory is an important use-case, and misalignment + cache-line splits can interfere with store-forwarding on some CPUs. To properly test this, make sure you test different misalignments, not just 1:15 (vector) or 1:3 (integer). (You currently only test a +1 offset relative to 16B-alignment).

I forget if it's just for store-forwarding, or for regular loads, but there may be less penalty when a load is split evenly across a cache-line boundary (an 8:8 vector, and maybe also 4:4 or 2:2 integer splits). You should test this. (I might be thinking of P4 lddqu or Core 2 movdqu.)

Intel's optimization manual has big tables of misalignment vs. store-forwarding from a wide store to narrow reloads that are fully contained in it. On some CPUs, this works in more cases when the wide store was naturally-aligned, even if it doesn't cross any cache-line boundaries. (Maybe on SnB/IvB, since they use a banked L1 cache with 16B banks, and splits across those can affect store forwarding. I didn't re-check the manual, but if you really want to test this experimentally, that's something you should be looking for.)


Which reminds me, misaligned loads are more likely to provoke cache-bank conflicts on SnB/IvB (because one load can touch two banks). But you won't see this loading from a single stream, because accessing the same bank in the same line twice in one cycle is fine. It's only accessing the same bank in different lines that can't happen in the same cycle. (e.g. when two memory accesses are a multiple of 128B apart.)

You don't make any attempt to test 4k page-splits. They're slower than regular cache-line splits, because they also need two TLB checks. (Skylake improved them from ~100 cycle penalty to ~5 cycle penalty beyond the normal load-use latency, though.)

You fail to test movups on aligned addresses, so you wouldn't detect that movups is slower than movaps on Core2 and earlier even when the memory is aligned at runtime. (I think unaligned mov loads up to 8 bytes were fine even in Core2, as long as they didn't cross a cache-line boundary. IDK how old a CPU you'd have to look at to find a problem with non-vector loads within a cache line. It would be a 32-bit only CPU, but you could still test 8B loads with MMX or SSE, or even x87. P5 Pentium and later guarantee that aligned 8B loads/stores are atomic, but P6 and newer guarantee that cached 8B loads/stores are atomic as long as no cache-line boundary is crossed. Unlike AMD where 8B boundaries matter for atomicity guarantees even in cacheable memory. Why is integer assignment on a naturally aligned variable atomic on x86?)

Go look at Agner Fog's stuff to learn more about how unaligned loads can be slower, and cook up tests to exercise those cases. Actually, Agner may not be the best resource for that, since his microarch guide mostly focuses on getting uops through the pipeline. Just a brief mention of the cost of cache-line splits, nothing in-depth about throughput vs. latency.

See also: Cacheline splits, take two, from Dark Shikari's blog (x264 lead developer), talking about unaligned load strategies on Core2: it was worth it to check for alignment and use a different strategy for the block.


Footnotes:

  1. 64B cache lines is a safe assumption these days. Pentium 3 and earlier had 32B lines. P4 had 64B lines but they were often transferred in 128B-aligned pairs. I thought I remembered reading that P4 actually had 128B lines in L2 or L3, but maybe that was just a distortion of 64B lines transferred in pairs. 7-CPU definitely says 64B lines in both levels of cache for a P4 130nm.


See also uarch-bench results for Skylake. Apparently someone has already written a tester that checks every possible misalignment relative to a cache-line boundary.


My testing on Skylake desktop (i7-6700k):

Addressing mode affects load-use latency, exactly as Intel documents in their optimization manual. I tested with integer mov rax, [rax+...], and with movzx/sx (in that case using the loaded value as an index, since it's too narrow to be a pointer).

;;;  Linux x86-64 NASM/YASM source.  Assemble into a static binary
;; public domain, originally written by peter@cordes.ca.
;; Share and enjoy.  If it breaks, you get to keep both pieces.

;;; This kind of grew while I was testing and thinking of things to test
;;; I left in some of the comments, but took out most of them and summarized the results outside this code block
;;; When I thought of something new to test, I'd edit, save, and up-arrow my assemble-and-run shell command
;;; Then edit the result into a comment in the source.

section .bss

;; 16 MiB buffer, aligned to a 2 MiB boundary.
ALIGN   2 * 1<<20   ; 2MB = 4096*512.  Uses hugepages in .bss but not in .data.  I checked in /proc/<pid>/smaps
buf:    resb 16 * 1<<20

section .text
global _start
_start:
    mov     esi, 128

;; EDX is the extra index register used by some of the commented-out ADDR()
;; variants below; the active setup zeroes it.
;   mov             edx, 64*123 + 8
;   mov             edx, 64*123 + 0
;   mov             edx, 64*64 + 0
    xor             edx,edx
   ;; RAX points into buf, 16B into the last 4k page of a 2M hugepage

    mov             eax, buf + (2<<20)*0 + 4096*511 + 64*0 + 16
    ;; Loop trip count.  The loop body holds 4 loads, so 25M trips = 100M loads.
    mov             ecx, 25000000

;; ADDR(x) selects the addressing mode / misalignment under test.  Exactly one
;; definition is active; the trailing comments record the latency measured on SKL.
%define ADDR(x)  x                     ; SKL: 4c
;%define ADDR(x)  x + rdx              ; SKL: 5c
;%define ADDR(x)  128+60 + x + rdx*2   ; SKL: 11c cache-line split
;%define ADDR(x)  x-8                 ; SKL: 5c
;%define ADDR(x)  x-7                 ; SKL: 12c for 4k-split (even if it's in the middle of a hugepage)
; ... many more things and a block of other result-recording comments taken out

;; Load destination.  With dst = rax each load produces the pointer used by the
;; next load, giving a dependent chain (latency measurement).
%define dst rax



        ;; Store RAX's own value at the address the loads will read, so every
        ;; `mov dst, [ADDR(rax)]` below reloads the same pointer value.
        mov             [ADDR(rax)], dst
align 32
.loop:
        mov             dst, [ADDR(rax)]
        mov             dst, [ADDR(rax)]
        mov             dst, [ADDR(rax)]
        mov             dst, [ADDR(rax)]
    dec         ecx
    jnz .loop

        ;; exit_group(0): syscall number 231 on x86-64 Linux, status in EDI.
        xor edi,edi
        mov eax,231
    syscall

Then run with

asm-link load-use-latency.asm && disas load-use-latency && 
    perf stat -etask-clock,cycles,L1-dcache-loads,instructions,branches -r4 ./load-use-latency

+ yasm -felf64 -Worphan-labels -gdwarf2 load-use-latency.asm
+ ld -o load-use-latency load-use-latency.o
 (disassembly output so my terminal history has the asm with the perf results)

 Performance counter stats for './load-use-latency' (4 runs):

     91.422838      task-clock:u (msec)       #    0.990 CPUs utilized            ( +-  0.09% )
   400,105,802      cycles:u                  #    4.376 GHz                      ( +-  0.00% )
   100,000,013      L1-dcache-loads:u         # 1093.819 M/sec                    ( +-  0.00% )
   150,000,039      instructions:u            #    0.37  insn per cycle           ( +-  0.00% )
    25,000,031      branches:u                #  273.455 M/sec                    ( +-  0.00% )

   0.092365514 seconds time elapsed                                          ( +-  0.52% )

In this case, I was testing mov rax, [rax], naturally-aligned, so cycles = 4*L1-dcache-loads. 4c latency. I didn't disable turbo or anything like that. Since nothing is going off the core, core clock cycles is the best way to measure.

  • [base + 0..2047]: 4c load-use latency, 11c cache-line split, 11c 4k-page split (even when inside the same hugepage). See Is there a penalty when base+offset is in a different page than the base? for more details: if base+disp turns out to be in a different page than base, the load uop has to be replayed.
  • any other addressing mode: 5c latency, 11c cache-line split, 12c 4k-split (even inside a hugepage). This includes [rax - 16]. It's not disp8 vs. disp32 that makes the difference.

So: hugepages don't help avoid page-split penalties (at least not when both pages are hot in the TLB). A cache-line split makes addressing mode irrelevant, but "fast" addressing modes have 1c lower latency for normal and page-split loads.

4k-split handling is fantastically better than before, see @harold's numbers where Haswell has ~32c latency for a 4k-split. (And older CPUs may be even worse than that. I thought pre-SKL it was supposed to be ~100 cycle penalty.)

Throughput (regardless of addressing mode), measured by using a destination other than rax so the loads are independent:

  • no split: 0.5c.
  • CL-split: 1c.
  • 4k-split: ~3.8 to 3.9c (much better than pre-Skylake CPUs)

Same throughput/latency for movzx/movsx (including WORD splits), as expected because they're handled in the load port (unlike some AMD CPUs, where there's also an ALU uop).

Cache-line split loads get replayed from the RS (Reservation Station): the counters for uops_dispatched_port.port_2 + port_3 came to 2x the number of mov rdi, [rdi] instructions, in another test using basically the same loop. (This was a dependent-load case, not throughput limited.) You can't detect a split load until after AGU.

Presumably when a load uop finds out that it needs data from a 2nd line, it looks for a split register (the buffer that Intel CPUs use to handle split loads), and puts the needed part of the data from the first line into that split reg. And also signals back to the RS that it needs to be replayed. (This is guesswork.)

I think even if neither cache line is present on a split, the split-load replay should happen within a few cycles (perhaps as soon as the load port reports back to the RS that it was a split, i.e. after address-generation). So demand-load requests for both sides of the split can be in flight at once.


See also Weird performance effects from nearby dependent stores in a pointer-chasing loop on IvyBridge. Adding an extra load speeds it up? for more about uop replays. (But note that's for uops dependent on a load, not the load uop itself. In that Q&A, the dependent uops are also mostly loads.)

A cache-miss load doesn't itself need to be replayed to "accept" the incoming data when it's ready, only dependent uops. See chat discussion on Are load ops deallocated from the RS when they dispatch, complete or some other time?. This https://godbolt.org/z/HJF3BN NASM test case on i7-6700k shows the same number of load uops dispatched regardless of L1d hits or L3 hits. But the number of ALU uops dispatched (not counting loop overhead) goes from 1 per load to ~8.75 per load. The scheduler aggressively schedules uops consuming the data to dispatch in the cycle when load data might arrive from L2 cache (and then very aggressively after that, it seems), instead of waiting one extra cycle to see if it did or not.

We haven't tested how aggressive replay is when there's other independent but younger work that could be done on the same port whose inputs are definitely ready.


SKL has two hardware page-walk units, which is probably related to the massive improvement in 4k-split performance. Even when there are no TLB misses, presumably older CPUs had to account for the fact that there might be.

It's interesting that the 4k-split throughput is non-integer. I think my measurements had enough precision and repeatability to say this. Remember this is with every load being a 4k-split, and no other work going on (except for being inside a small dec/jnz loop). If you ever have this in real code, you're doing something really wrong.

I don't have any solid guesses at why it might be non-integer, but clearly there's a lot that has to happen microarchitecturally for a 4k-split. It's still a cache-line split, and it has to check the TLB twice.

这篇关于如何在x86_64上准确基准未对齐的访问速度的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆