测量缓存延迟 [英] Measuring Cache Latencies

查看:288
本文介绍了测量缓存潜伏期的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试用 C 语言测量 L1、L2、L3 缓存的延迟。我知道它们的大小,也自认为在概念上理解该怎么做,但在实现上遇到了问题。我想知道是否是预取(pre-fetching)之类的其他硬件机制造成了问题。

 的#include<&time.h中GT;
#包括LT&;&stdio.h中GT;
#包括LT&;&string.h中GT;诠释主(){
    函数srand(时间(NULL)); //种子ONCE
    const int的L1_CACHE_SIZE = 32768 /的sizeof(int)的;
    const int的L2_CACHE_SIZE = 262144 /的sizeof(int)的;
    const int的L3_CACHE_SIZE = 6587392 /的sizeof(int)的;
    const int的NUM_ACCESSES = 1000000;
    const int的SECONDS_PER_NS = 10亿;
    INT了ArrayAccess [L1_CACHE_SIZE]
    INT arrayInvalidateL1 [L1_CACHE_SIZE]
    INT arrayInvalidateL2 [L2_CACHE_SIZE]
    INT arrayInvalidateL3 [L3_CACHE_SIZE]
    诠释计数= 0;
    INT索引= 0;
    INT I = 0;
    结构的timespec startAccess,endAccess;
    双mainMemAccess,L1Access,L2Access,L3Access;
    INT readValue = 0;    memset的(ArrayAccess接口,0,L1_CACHE_SIZE *的sizeof(INT));
    memset的(arrayInvalidateL1,0,L1_CACHE_SIZE *的sizeof(INT));
    memset的(arrayInvalidateL2,0,L2_CACHE_SIZE *的sizeof(INT));
    memset的(arrayInvalidateL3,0,L3_CACHE_SIZE *的sizeof(INT));    索引= 0;
    clock_gettime(CLOCK_REALTIME,&安培; startAccess); //启动时钟
    而(指数< L1_CACHE_SIZE){
        INT TMP = ArrayAccess接口[指数]从L2 //访问值
        指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
        算上++; //这个划分总时间
    }
    clock_gettime(CLOCK_REALTIME,&安培; endAccess); //结束时钟
    mainMemAccess =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
    mainMemAccess / =计数;    的printf(主存储器存取%LF \\ n,mainMemAccess);    索引= 0;
    计数= 0;
    clock_gettime(CLOCK_REALTIME,&安培; startAccess); //启动时钟
    而(指数< L1_CACHE_SIZE){
        INT TMP = ArrayAccess接口[指数]从L2 //访问值
        指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
        算上++; //这个划分总时间
    }
    clock_gettime(CLOCK_REALTIME,&安培; endAccess); //结束时钟
    L1Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
    L1Access / =计数;    的printf(L1高速缓存访​​问LF%\\ n,L1Access);    //通过访问数组的所有元素无效L1比大缓存
    为(计数= 0; COUNT< L1_CACHE_SIZE;计数++){
        INT读= arrayInvalidateL1 [统计]
        阅读++;
        readValue + =读;
    }    索引= 0;
    计数= 0;
    clock_gettime(CLOCK_REALTIME,&安培; startAccess); //启动时钟
    而(指数< L1_CACHE_SIZE){
        INT TMP = ArrayAccess接口[指数]从L2 //访问值
        指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
        算上++; //这个划分总时间
    }
    clock_gettime(CLOCK_REALTIME,&安培; endAccess); //结束时钟
    L2Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
    L2Access / =计数;    的printf(二级高速缓存艾策斯LF%\\ n,L2Access);    //通过访问数组的所有元素无效L2比大缓存
    为(计数= 0; COUNT< L2_CACHE_SIZE;计数++){
        INT读= arrayInvalidateL2 [统计]
        阅读++;
        readValue + =读;
    }    索引= 0;
    计数= 0;
    clock_gettime(CLOCK_REALTIME,&安培; startAccess); // sreadValue + =读;挞时钟
    而(指数< L1_CACHE_SIZE){
        INT TMP = ArrayAccess接口[指数]从L2 //访问值
        指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
        算上++; //这个划分总时间
    }
    clock_gettime(CLOCK_REALTIME,&安培; endAccess); //结束时钟
    L3Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
    L3Access / =计数;    的printf(L3高速缓存访​​问LF%\\ n,L3Access);    的printf(读值数:%d,readValue);}

我首先访问数组中想要读取的数据。由于是第一次访问,这个值显然应该来自主内存。该数组很小(小于页大小),因此应被加载进 L1、L2、L3。接着我再次访问同一数组中的值,这时它应该来自 L1。然后我访问一个与 L1 缓存同样大小的数组的所有元素,以便把我想访问的数据从 L1 中逐出(此时它应该只存在于 L2/L3 中)。随后我对 L2 和 L3 重复这一过程。但测得的访问时间明显不对,这说明我做错了什么……

我觉得问题可能出在计时本身的开销上(启动和停止计时需要若干纳秒,而且这段时间会随计时代码是否被缓存而变化)。

有人能指点一下我可能哪里做错了吗?

更新1:我通过大量访问来摊销计时器的开销,修正了缓存的大小,并采纳建议使用了更复杂的索引模式以避免固定步长。遗憾的是,时间仍然不对,它们看起来都像是 L1 的延迟。我怀疑问题可能出在逐出(invalidate)而不是访问上。随机替换与 LRU 替换策略会影响被逐出的数据吗?

更新2:修正了 memset(增加了对 L3 数组的 memset,以便把 L3 中的数据也逐出,使首次访问从主内存开始)和索引方案,仍然没有成功。

更新3:我始终没能让这种方法奏效,不过有一些不错的参考答案,我也贴出了自己的两个解决方案。

我还运行了 Cachegrind 来查看命中/未命中情况

  == == 6710我裁判:1735104
== == 6710偏出I1:1092
== == 6710 LLI错过:1084
== == 6710 I1命中率:0.06%
== == 6710 LLI命中率:0.06%
== == 6710
== 6710 = D裁判:1250696(721162 RD + 529534 WR)
== == 6710 D1错过:116492(7,627 RD + 108865 WR)
== == 6710 LLD错过:115102(6414 RD + 108688 WR)
== == 6710 D1命中率:9.3%(1.0%+ 20.5%)
== == 6710 LLD命中率:9.2%(0.8%+ 20.5%)
== == 6710
== == 6710 LL裁判:117584(8719 RD + 108865 WR)
== == 6710 LL失误:116186(7498 RD + 108688 WR)
== == 6710 LL命中率:3.8%(0.3%+ 20.5%)
        铱I1mr ILMR博士D1mr DLmr DW D1mw DLmw      。 。 。 。 。 。 。 。 。 #包括LT&;&time.h中GT;
      。 。 。 。 。 。 。 。 。 #包括LT&;&stdio.h中GT;
      。 。 。 。 。 。 。 。 。 #包括LT&;&string.h中GT;
      。 。 。 。 。 。 。 。 。
      6 0 0 0 0 0 2 0 0 INT的main(){
      5 1 1 0 0 0 2 0 0函数srand(时间(NULL)); //种子ONCE
      1 0 0 0 0 0 1 0 0 const int的L1_CACHE_SIZE = 32768 / sizeof的(INT);
      1 0 0 0 0 0 1 0 0 const int的L2_CACHE_SIZE = 262144 /的sizeof(int)的;
      1 0 0 0 0 0 1 0 0 const int的L3_CACHE_SIZE = 6587392 /的sizeof(int)的;
      1 0 0 0 0 0 1 0 0 const int的NUM_ACCESSES = 1000000;
      1 0 0 0 0 0 1 0 0 const int的SECONDS_PER_NS = 10亿;
     21 2 2 3 0 0 3 0 0 INT了ArrayAccess [L1_CACHE_SIZE]
     21 1 1 3 0 0 3 0 0 INT arrayInvalidateL1 [L1_CACHE_SIZE]
     21 2 2 3 0 0 3 0 0 INT arrayInvalidateL2 [L2_CACHE_SIZE]
     21 1 1 3 0 0 3 0 0 INT arrayInvalidateL3 [L3_CACHE_SIZE]
      1 0 0 0 0 0 1 0 0诠释计数= 0;
      1 1 1 0 0 0 1 0 0 INT索引= 0;
      1 0 0 0 0 0 1 0 0 INT I = 0;
      。 。 。 。 。 。 。 。 。结构的timespec startAccess,endAccess;
      。 。 。 。 。 。 。 。 。双mainMemAccess,L1Access,L2Access,L3Access;
      1 0 0 0 0 0 1 0 0 INT readValue = 0;
      。 。 。 。 。 。 。 。 。
      7 0 0 2 0 0 1 1 1的memset(ArrayAccess接口,0,L1_CACHE_SIZE *的sizeof(int)的);
      7 1 1 2 2 0 1 0 0 memset的(arrayInvalidateL1,0,L1_CACHE_SIZE *的sizeof(int)的);
      7 0 0 2 2 0 1 0 0 memset的(arrayInvalidateL2,0,L2_CACHE_SIZE *的sizeof(int)的);
      7 1 1 2 2 0 1 0 0 memset的(arrayInvalidateL3,0,L3_CACHE_SIZE *的sizeof(int)的);
      。 。 。 。 。 。 。 。 。
      1 0 0 0 0 0 1 1 1索引= 0;
      4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; startAccess); //启动时钟
    772 1 514 0 0 0 0 0时(指数< L1_CACHE_SIZE){
  1,280 11 768 257 257 256 0 0 INT TMP = ArrayAccess接口[指数]从L2 //访问值
  2688 0 0 768 0 0 256 0 0指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
    256 0 0 0 256 0 0 0 0计数++; //这个划分总时间
      。 。 。 。 。 。 。 。 。 }
      4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; endAccess); //结束时钟
     14 1 1 5 1 1 1 1 1 mainMemAccess =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
      6 0 0 2 0 0 1 0 0 mainMemAccess / =计数;
      。 。 。 。 。 。 。 。 。
      6 1 1 2 0 0 2 0 0的printf(主存储器存取%LF \\ n,mainMemAccess);
      。 。 。 。 。 。 。 。 。
      1 0 0 0 0 0 1 0 0索引= 0;
      1 0 0 0 0 0 1 0 0计数= 0;
      4 1 1 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; startAccess); //启动时钟
    772 1 514 0 0 0 0 0时(指数< L1_CACHE_SIZE){
  1,280 0 0 768 240 0 256 0 0 INT TMP = ArrayAccess接口[指数]从L2 //访问值
  2688 0 0 768 0 0 256 0 0指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
    256 0 0 0 256 0 0 0 0计数++; //这个划分总时间
      。 。 。 。 。 。 。 。 。 }
      4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; endAccess); //结束时钟
     14 1 1 5 0 0 1 1 0 L1Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
      6 1 1 2 0 0 1 0 0 L1Access / =计数;
      。 。 。 。 。 。 。 。 。
      6 0 0 2 0 0 2 0 0的printf(L1高速缓存访​​问LF%\\ n,L1Access);
      。 。 。 。 。 。 。 。 。
      。 。 。 。 。 。 。 。 。 //通过访问数组的所有元素无效L1比大缓存
 32773 11 24578 0 0 1 0 0(计数= 0; COUNT< L1_CACHE_SIZE;计数++){
 40,960 0 0 24,576 513 513 8,192 0 0 INT读= arrayInvalidateL1 [统计]
  8,192 0 0 8,192 0 0 0 0 0读++;
 16,384 0 0 16,384 0 0 0 0 0 readValue + =读;
      。 。 。 。 。 。 。 。 。 }
      。 。 。 。 。 。 。 。 。
      1 0 0 0 0 0 1 0 0索引= 0;
      1 1 1 0 0 0 1 0 0计数= 0;
      4 0 0 0 0 0 1 1 0 clock_gettime(CLOCK_REALTIME,&放大器; startAccess); //启动时钟
    772 1 514 0 0 0 0 0时(指数< L1_CACHE_SIZE){
  1,280 0 0 768 256 0 256 0 0 INT TMP = ArrayAccess接口[指数]从L2 //访问值
  2688 0 0 768 0 0 256 0 0指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
    256 0 0 0 256 0 0 0 0计数++; //这个划分总时间
      。 。 。 。 。 。 。 。 。 }
      4 1 1 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; endAccess); //结束时钟
     14 0 0 5 1 0 1 1 0 L2Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
      6 1 1 2 0 0 1 0 0 L2Access / =计数;
      。 。 。 。 。 。 。 。 。
      6 0 0 2 0 0 2 0 0的printf(二级高速缓存艾策斯LF%\\ n,L2Access);
      。 。 。 。 。 。 。 。 。
      。 。 。 。 。 。 。 。 。 //通过访问数组的所有元素无效L2比大缓存
262149 2 2 196610 0 0 1 0 0(计数= 0;计数&下; L2_CACHE_SIZE;计数++){
327,680 0 0 196,608 4097 4095 65,536 0 0 INT读= arrayInvalidateL2 [统计]
 65,536 0 0 65,536 0 0 0 0 0读++;
131,072 0 0 131,072 0 0 0 0 0 readValue + =读;
      。 。 。 。 。 。 。 。 。 }
      。 。 。 。 。 。 。 。 。
      1 0 0 0 0 0 1 0 0索引= 0;
      1 0 0 0 0 0 1 0 0计数= 0;
      4 0 0 0 0 0 1 1 0 clock_gettime(CLOCK_REALTIME,&放大器; startAccess); // sreadValue + =读;挞时钟
    772 1 514 0 0 0 0 0时(指数< L1_CACHE_SIZE){
  1,280 0 0 768 256 0 256 0 0 INT TMP = ArrayAccess接口[指数]从L2 //访问值
  2688 0 0 768 0 0 256 0 0指数=(指数+ TMP +((指数及4)28:36)); //平均这应该给32元跳跃,不断变化的步伐
    256 0 0 0 256 0 0 0 0计数++; //这个划分总时间
      。 。 。 。 。 。 。 。 。 }
      4 0 0 0 0 0 1 0 0 clock_gettime(CLOCK_REALTIME,&放大器; endAccess); //结束时钟
     14 1 1 5 1 0 1 1 0 L3Access =((endAccess.tv_sec - startAccess.tv_sec)* SECONDS_PER_NS)+(endAccess.tv_nsec - startAccess.tv_nsec);
      6 0 0 2 0 0 1 0 0 L3Access / =计数;
      。 。 。 。 。 。 。 。 。
      6 1 1 2 0 0 2 0 0的printf(L3高速缓存访​​问LF%\\ n,L3Access);
      。 。 。 。 。 。 。 。 。
      6 0 0 1 0 0 1 0 0的printf(读值数:%d,readValue);
      。 。 。 。 。 。 。 。 。
      3 0 0 3 0 0 0 0 0}


解决方案

我更倾向于使用硬件时钟来测量。rdtsc 指令会返回自 CPU 上电以来的当前周期计数。此外,最好使用 asm(内联汇编)来确保在测量运行和空跑(dry run)中执行的指令完全相同。利用这一点再加上一些巧妙的统计,我很久以前写过下面的程序:

 的#include<&stdlib.h中GT;
#包括LT&;&stdio.h中GT;
#包括LT&;&stdint.h GT;
#包括LT&;&fcntl.h GT;
#包括LT&;&unistd.h中GT;
#包括LT&;&string.h中GT;
#包括LT&; SYS / mman.h>
INT i386_cpuid_caches(为size_t * data_caches){
    INT I;
    INT num_data_caches = 0;
    对于(i = 0; I< 32;我++){        //变量保持4 i386的遗留寄存器的内容
        uint32_t的EAX,EBX,ECX,EDX;        EAX = 4; //获得缓存信息
        ECX = I; //高速缓存ID        ASM(
            CPUID//调用i386的CPUID指令
            :+ A(EAX)//包含CPUID命令code,4缓存查询
            ,= B(EBX)
            + C(ECX)//包含高速缓存ID
            ,= D(EDX)
        ); //生成4个寄存器EAX,EBX,ECX和EDX输出        //从http://download.intel.com/products/processor/manual/325462.pdf卷服用。 2A 3-149
        INT cache_type = EAX和放大器; 0x1F的;        如果(cache_type == 0)//有效的缓存标识的结束
            打破;        字符* cache_type_string;
        开关(cache_type){
            案例1:cache_type_string =数据高速缓存;打破;
            案例2:cache_type_string =指令Cache打破;
            案例3:cache_type_string =统一缓存;打破;
            默认:cache_type_string =未知类型的缓存打破;
        }        INT cache_level =(EAX>> = 5)及为0x7;        INT cache_is_self_initializing =(EAX>> = 3)及为0x1; //不需要SW初始化
        INT cache_is_fully_associative =(EAX>> = 1)及为0x1;
        //从http://download.intel.com/products/processor/manual/325462.pdf 3-166卷服用。 2A
        // EBX包含10个,10 3的整数和12位分别
        unsigned int类型cache_sets = ECX + 1;
        unsigned int类型cache_coherency_line_size =(EBX&安培; 0xFFF的)+ 1;
        unsigned int类型cache_physical_line_partitions =((EBX>> = 12)及0x3FF处)+ 1;
        无符号整型cache_ways_of_associativity =((EBX>&GT = 10)及为0x3FF)+ 1;        //总缓存大小是产品
        为size_t cache_total_size = cache_ways_of_associativity * cache_physical_line_partitions * cache_coherency_line_size * cache_sets;        如果(cache_type == 1 || cache_type == 3){
            data_caches [num_data_caches ++] = cache_total_size;
        }        的printf(
            高速缓存ID%D:\\ n
             - 等级:%d个\\ N
             - 类型:%S \\ n
             - 集数:%d \\ n
             - 系统一致性口径:%d字节\\ n
             - 物理线路的分区数:%d \\ n
             - 关联方式:%d个\\ N
             - 总大小:%祖字节(%祖KB)\\ n
             - 是完全关联:%S \\ n
             - 是自初始化:%S \\ n
            \\ n
            , 一世
            ,cache_level
            ,cache_type_string
            ,cache_sets
            ,cache_coherency_line_size
            ,cache_physical_line_partitions
            ,cache_ways_of_associativity
            ,cache_total_size,cache_total_size>> 10
            ,cache_is_fully_associative? 真假
            ,cache_is_self_initializing? 真假
        );
    }    返回num_data_caches;
}INT test_cache(为size_t尝试,为size_t lower_cache_size,为int *延迟,为size_t max_latency){
    INT FD =打开(/开发/ urandom的,O_RDONLY);
    如果(FD℃,){
        PERROR(开放);
        中止();
    }
    字符* random_data = MMAP(
          空值
        ,lower_cache_size
        ,PROT_READ | PROT_WRITE
        ,MAP_PRIVATE | MAP_ANON // | MAP_POPULATE
        ,-1
        ,0
        ); //获得一些随机数据
    如果(random_data == MAP_FAILED){
        PERROR(MMAP);
        中止();
    }    为size_t我;
    对于(i = 0; I< lower_cache_size; I + =的sysconf(_SC_PAGESIZE)){
        random_data [I] = 1;
    }
    的int64_t random_offset = 0;
    而(attempts--){
        //为准确的测量使用的处理器的时钟计时器
        random_offset + = RAND();
        random_offset%= lower_cache_size;
        int32_t cycles_used,EDX,temp1目录,TEMP2;
        ASM(
            MFENCE \\ n \\ t//存储栅栏
            RDTSC \\ n \\ t//获得CPU周期数
            MOV %% EDX,%2 \\ n \\ t的
            MOV EAX %%%3 \\ n \\ t的
            MFENCE \\ n \\ t//存储栅栏
            MOV%4 %%人\\ n \\ t//加载数据
            MFENCE \\ n \\ t的
            RDTSC \\ n \\ t的
            分%2 %% EDX \\ n \\ t//循环。减去计数
            SBB%3 %% EAX//循环。减去计数
            :=一个(cycles_used)
            ,= D(EDX)
            = R(temp1目录)
            = R(TEMP2)
            :M(random_data [random_offset])
            );
        //的printf(%d个\\ N,cycles_used);
        如果(cycles_used< max_latency)
            潜伏期[cycles_used] ++;
        其他
            延迟[max_latency - 1] ++;
    }    在munmap(random_data,lower_cache_size);    返回0;
}诠释主(){
    为size_t cache_sizes [32];
    INT num_data_caches = i386_cpuid_caches(cache_sizes);    INT延迟[0x400的];
    memset的(延迟,0,sizeof的(延迟));    INT empty_cycles = 0;    INT I;
    INT试图= 1000000;
    对于(i = 0; I<尝试;我++){//衡量我们多么的开销有计数cyscles
        int32_t cycles_used,EDX,temp1目录,TEMP2;
        ASM(
            MFENCE \\ n \\ t//存储栅栏
            RDTSC \\ n \\ t//获得CPU周期数
            MOV %% EDX,%2 \\ n \\ t的
            MOV EAX %%%3 \\ n \\ t的
            MFENCE \\ n \\ t//存储栅栏
            MFENCE \\ n \\ t的
            RDTSC \\ n \\ t的
            分%2 %% EDX \\ n \\ t//循环。减去计数
            SBB%3 %% EAX//循环。减去计数
            :=一个(cycles_used)
            ,= D(EDX)
            = R(temp1目录)
            = R(TEMP2)
            :
            );
        如果(cycles_used<的sizeof(潜伏期)/ sizeof的(*延迟))
            潜伏期[cycles_used] ++;
        其他
            潜伏期[的sizeof(潜伏期)/ sizeof的(*延迟) - 1] ++;    }    {
        诠释J;
        为size_t总和= 0;
        为(J = 0; J<的sizeof(潜伏期)/ sizeof的(*延迟); J ++){
            总和+ =延迟[J]。
        }
        为size_t SUM2 = 0;
        为(J = 0; J<的sizeof(潜伏期)/ sizeof的(*延迟); J ++){
            SUM2 + =延迟[J]。
            如果(SUM2> = SUM * .75)在{
                empty_cycles = j的;
                fprintf中(标准错误,空计数需要%d个循环\\ N,empty_cycles);
                打破;
            }
        }
    }    对于(i = 0; I< num_data_caches;我++){
        test_cache(尝试,cache_sizes [I] * 4,延迟,的sizeof(潜伏期)/ sizeof的(*延迟));        诠释J;
        为size_t总和= 0;
        为(J = 0; J<的sizeof(潜伏期)/ sizeof的(*延迟); J ++){
            总和+ =延迟[J]。
        }
        为size_t SUM2 = 0;
        为(J = 0; J<的sizeof(潜伏期)/ sizeof的(*延迟); J ++){
            SUM2 + =延迟[J]。
            如果(SUM2> = SUM * .75)在{
                fprintf中(标准错误,缓存ID%i有延迟%d个周期的\\ n,I,J - empty_cycles);
                打破;
            }
        }    }    返回0;}

在我的酷睿2输出:

 缓存ID 0:
- 1级
- 类型:数据缓存
- 总面积:32768字节(32 KB)高速缓存ID 1:
- 1级
- 类型:指令Cache
- 总面积:32768字节(32 KB)高速缓存ID 2:
- 等级:2
- 类型:统一高速缓存
- 总大小:262144字节(256 KB)高速缓存ID 3:
- 等级:3
- 类型:统一高速缓存
- 总大小:3145728字节(3072 KB)空计数需90个周期
高速缓存ID 0有延迟6个周期
高速缓存ID 2有延迟21个周期
高速缓存ID 3具有延迟168次

So I am trying to measure the latencies of L1, L2, L3 cache using C. I know the size of them and I feel I understand conceptually how to do it but I am running into problems with my implementation. I am wondering if some of the other hardware intricacies like pre-fetching are causing issues.

#include <time.h>
#include <stdio.h>
#include <string.h>

int main(){
    // Micro-benchmark: times a pointer-chasing pass over arrayAccess, then
    // tries to evict it with other arrays so later passes are served from
    // L2/L3, reporting the average ns per access for each level.
    // NOTE(review): the four automatic arrays below total roughly 7 MB of
    // stack space, which may exceed the default stack limit — confirm ulimit.
    srand(time(NULL));  // Seed ONCE
    const int L1_CACHE_SIZE =  32768/sizeof(int);    // cache sizes expressed in int elements, not bytes
    const int L2_CACHE_SIZE =  262144/sizeof(int);
    const int L3_CACHE_SIZE =  6587392/sizeof(int);
    const int NUM_ACCESSES = 1000000;                // unused in this version
    const int SECONDS_PER_NS = 1000000000;           // misnamed: this is nanoseconds per second
    int arrayAccess[L1_CACHE_SIZE];       // the array whose access latency is measured
    int arrayInvalidateL1[L1_CACHE_SIZE]; // read in full to evict arrayAccess from L1
    int arrayInvalidateL2[L2_CACHE_SIZE]; // read in full to evict arrayAccess from L2
    int arrayInvalidateL3[L3_CACHE_SIZE]; // declared and zeroed but never read below
    int count=0;
    int index=0;
    int i=0;                              // unused
    struct timespec startAccess, endAccess;
    double mainMemAccess, L1Access, L2Access, L3Access;
    int readValue=0;

    // Zeroing also touches every line, so these arrays enter the cache
    // hierarchy here; the first timed pass may therefore not start cold.
    memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
    memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));

    // Pass 1: intended to measure main-memory latency (first touch).
    index = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //intended first-touch read from main memory
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    mainMemAccess /= count;   // average ns per access

    printf("Main Memory Access %lf\n", mainMemAccess);

    // Pass 2: same walk again, expecting the data to now be resident in L1.
    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //should now hit in L1
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
    L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L1Access /= count;

    printf("L1 Cache Access %lf\n", L1Access);

    //invalidate L1 by accessing all elements of array which is larger than cache
    // NOTE(review): arrayInvalidateL1 is the SAME size as L1, not larger, and a
    // sequential walk may not evict every line under a non-LRU policy.
    for(count=0; count < L1_CACHE_SIZE; count++){
        int read = arrayInvalidateL1[count]; 
        read++;
        readValue+=read;               
    }

    // Pass 3: walk again, expecting hits from L2 after the eviction above.
    index = 0;
    count = 0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //Access Value from L2
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L2Access /= count;

    printf("L2 Cache Acces %lf\n", L2Access);

    //invalidate L2 by accessing all elements of array which is larger than cache
    // NOTE(review): same caveat — the array equals the L2 size, it is not larger.
    for(count=0; count < L2_CACHE_SIZE; count++){
        int read = arrayInvalidateL2[count];  
        read++;
        readValue+=read;                        
    }

    // Pass 4: walk again, expecting hits from L3 (arrayInvalidateL3 is never
    // actually read, so nothing evicts arrayAccess from L3 before this pass).
    index = 0;
    count=0;
    clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    while (index < L1_CACHE_SIZE) {
        int tmp = arrayAccess[index];               //intended: Access Value from L3
        index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
        count++;                                           //divide overall time by this 
    }
    clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
    L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
    L3Access /= count;

    printf("L3 Cache Access %lf\n", L3Access);

    // Printing readValue keeps the compiler from discarding the eviction loops.
    printf("Read Value: %d", readValue);

}

I start out by accessing a value in the array I want data from. This should obviously come from main memory because it the first access. The array is small (less than page size) so it should be copied into L1, L2, L3. I access value from the same array which should now be L1. I then access all the values from an array of the same size as L1 cache to invalidate data I want to access (so now it should just be in L2/3). Then I repeat this process for L2 and L3. The access times are clearly off though, which means I am doing something wrong...

I think there might be issues with the time it takes to clock (start and stop are going to take some time in ns and it will change when they are cached/unchached)

Can someone give me some pointers on what I might be doing wrong?

UPDATE1: So i amortized the cost of the timer by making lots of accesses, I fixed the size of my caches and I also took the advice to make a more complex indexing scheme to avoid fixed strides. Unfortunately the times are still off. They all seem to be coming for L1. I am thinking the issue might be with invalidating instead of accessing. Would a random vs LRU scheme affect the data being invalidated?

UPDATE2: Fixed the memset (Added L3 memset to invalidate data in L3 as well so first access starts at main memory) and indexing scheme, still no luck.

UPDATE3: I couldn't ever get this method to work but there were some good suggested answers and I posted a couple solutions of my own.

I also ran Cachegrind to view hit/miss

 ==6710== I   refs:      1,735,104
==6710== I1  misses:        1,092
==6710== LLi misses:        1,084
==6710== I1  miss rate:      0.06%
==6710== LLi miss rate:      0.06%
==6710== 
==6710== D   refs:      1,250,696  (721,162 rd   + 529,534 wr)
==6710== D1  misses:      116,492  (  7,627 rd   + 108,865 wr)
==6710== LLd misses:      115,102  (  6,414 rd   + 108,688 wr)
==6710== D1  miss rate:       9.3% (    1.0%     +    20.5%  )
==6710== LLd miss rate:       9.2% (    0.8%     +    20.5%  )
==6710== 
==6710== LL refs:         117,584  (  8,719 rd   + 108,865 wr)
==6710== LL misses:       116,186  (  7,498 rd   + 108,688 wr)
==6710== LL miss rate:        3.8% (    0.3%     +    20.5%  )


        Ir I1mr ILmr      Dr  D1mr  DLmr     Dw D1mw DLmw 

      .    .    .       .     .     .      .    .    .  #include <time.h>
      .    .    .       .     .     .      .    .    .  #include <stdio.h>
      .    .    .       .     .     .      .    .    .  #include <string.h>
      .    .    .       .     .     .      .    .    .  
      6    0    0       0     0     0      2    0    0  int main(){
      5    1    1       0     0     0      2    0    0      srand(time(NULL));  // Seed ONCE
      1    0    0       0     0     0      1    0    0      const int L1_CACHE_SIZE =  32768/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L2_CACHE_SIZE =  262144/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int L3_CACHE_SIZE =  6587392/sizeof(int);
      1    0    0       0     0     0      1    0    0      const int NUM_ACCESSES = 1000000;
      1    0    0       0     0     0      1    0    0      const int SECONDS_PER_NS = 1000000000;
     21    2    2       3     0     0      3    0    0      int arrayAccess[L1_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL1[L1_CACHE_SIZE];
     21    2    2       3     0     0      3    0    0      int arrayInvalidateL2[L2_CACHE_SIZE];
     21    1    1       3     0     0      3    0    0      int arrayInvalidateL3[L3_CACHE_SIZE];
      1    0    0       0     0     0      1    0    0      int count=0;
      1    1    1       0     0     0      1    0    0      int index=0;
      1    0    0       0     0     0      1    0    0      int i=0;
      .    .    .       .     .     .      .    .    .      struct timespec startAccess, endAccess;
      .    .    .       .     .     .      .    .    .      double mainMemAccess, L1Access, L2Access, L3Access;
      1    0    0       0     0     0      1    0    0      int readValue=0;
      .    .    .       .     .     .      .    .    .  
      7    0    0       2     0     0      1    1    1      memset(arrayAccess, 0, L1_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL1, 0, L1_CACHE_SIZE*sizeof(int));
      7    0    0       2     2     0      1    0    0      memset(arrayInvalidateL2, 0, L2_CACHE_SIZE*sizeof(int));
      7    1    1       2     2     0      1    0    0      memset(arrayInvalidateL3, 0, L3_CACHE_SIZE*sizeof(int));
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    1    1      index = 0;
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    1    1     768   257   257    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     1      1    1    1      mainMemAccess = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      mainMemAccess /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("Main Memory Access %lf\n", mainMemAccess);
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   240     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock              
     14    1    1       5     0     0      1    1    0      L1Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L1Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L1 Cache Access %lf\n", L1Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L1 by accessing all elements of array which is larger than cache
 32,773    1    1  24,578     0     0      1    0    0      for(count=0; count < L1_CACHE_SIZE; count++){
 40,960    0    0  24,576   513   513  8,192    0    0          int read = arrayInvalidateL1[count]; 
  8,192    0    0   8,192     0     0      0    0    0          read++;
 16,384    0    0  16,384     0     0      0    0    0          readValue+=read;               
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    1    1       0     0     0      1    0    0      count = 0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //start clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    1    1       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    0    0       5     1     0      1    1    0      L2Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    1    1       2     0     0      1    0    0      L2Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    0    0       2     0     0      2    0    0      printf("L2 Cache Acces %lf\n", L2Access);
      .    .    .       .     .     .      .    .    .  
      .    .    .       .     .     .      .    .    .      //invalidate L2 by accessing all elements of array which is larger than cache
262,149    2    2 196,610     0     0      1    0    0      for(count=0; count < L2_CACHE_SIZE; count++){
327,680    0    0 196,608 4,097 4,095 65,536    0    0          int read = arrayInvalidateL2[count];  
 65,536    0    0  65,536     0     0      0    0    0          read++;
131,072    0    0 131,072     0     0      0    0    0          readValue+=read;                        
      .    .    .       .     .     .      .    .    .      }
      .    .    .       .     .     .      .    .    .  
      1    0    0       0     0     0      1    0    0      index = 0;
      1    0    0       0     0     0      1    0    0      count=0;
      4    0    0       0     0     0      1    1    0      clock_gettime(CLOCK_REALTIME, &startAccess); //sreadValue+=read;tart clock
    772    1    1     514     0     0      0    0    0      while (index < L1_CACHE_SIZE) {
  1,280    0    0     768   256     0    256    0    0          int tmp = arrayAccess[index];               //Access Value from L2
  2,688    0    0     768     0     0    256    0    0          index = (index + tmp + ((index & 4) ? 28 : 36));   // on average this should give 32 element skips, with changing strides
    256    0    0     256     0     0      0    0    0          count++;                                           //divide overall time by this 
      .    .    .       .     .     .      .    .    .      }
      4    0    0       0     0     0      1    0    0      clock_gettime(CLOCK_REALTIME, &endAccess); //end clock
     14    1    1       5     1     0      1    1    0      L3Access = ((endAccess.tv_sec - startAccess.tv_sec) * SECONDS_PER_NS) + (endAccess.tv_nsec - startAccess.tv_nsec);
      6    0    0       2     0     0      1    0    0      L3Access /= count;
      .    .    .       .     .     .      .    .    .  
      6    1    1       2     0     0      2    0    0      printf("L3 Cache Access %lf\n", L3Access);
      .    .    .       .     .     .      .    .    .  
      6    0    0       1     0     0      1    0    0      printf("Read Value: %d", readValue);
      .    .    .       .     .     .      .    .    .  
      3    0    0       3     0     0      0    0    0  }

解决方案

I would rather try to use the hardware clock as a measure. The rdtsc instruction will tell you the current cycle count since the CPU was powered up. Also it is better to use asm to make sure always the same instructions are used in both measured and dry runs. Using that and some clever statistics I have made this a long time ago:

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <sys/mman.h>


/*
 * Enumerate this CPU's caches via the CPUID "deterministic cache
 * parameters" leaf (EAX=4, ECX=cache index).  Prints a description of
 * every cache found to stdout, stores the total size in bytes of each
 * data or unified cache into data_caches, and returns how many sizes
 * were stored.
 *
 * data_caches must have room for at least 32 entries (the loop bound).
 * NOTE(review): leaf 4 is Intel-specific; AMD CPUs expose equivalent
 * data via leaf 0x8000001D, so this will typically find no caches there.
 */
int i386_cpuid_caches (size_t * data_caches) {
    int i;
    int num_data_caches = 0;
    for (i = 0; i < 32; i++) {

        // Variables to hold the contents of the 4 i386 legacy registers
        uint32_t eax, ebx, ecx, edx; 

        eax = 4; // CPUID leaf 4: deterministic cache parameters
        ecx = i; // index of the cache to query within the leaf

        asm (
            "cpuid" // call i386 cpuid instruction
            : "+a" (eax) // contains the cpuid command code, 4 for cache query
            , "=b" (ebx)
            , "+c" (ecx) // contains the cache id
            , "=d" (edx)
        ); // generates output in 4 registers eax, ebx, ecx and edx 

        // Field layout from the Intel SDM Vol. 2A, CPUID leaf 04H:
        // EAX bits 4:0 = cache type (0 means "no more caches")
        int cache_type = eax & 0x1F; 

        if (cache_type == 0) // end of valid cache identifiers
            break;

        char * cache_type_string;
        switch (cache_type) {
            case 1: cache_type_string = "Data Cache"; break;
            case 2: cache_type_string = "Instruction Cache"; break;
            case 3: cache_type_string = "Unified Cache"; break;
            default: cache_type_string = "Unknown Type Cache"; break;
        }

        // EAX bits 7:5 = cache level (1..3)
        int cache_level = (eax >>= 5) & 0x7;

        int cache_is_self_initializing = (eax >>= 3) & 0x1; // does not need SW initialization
        int cache_is_fully_associative = (eax >>= 1) & 0x1;


        // EBX packs three fields: line size (bits 11:0), physical line
        // partitions (bits 21:12) and ways of associativity (bits 31:22).
        // Each field is stored minus one, hence the "+ 1" below.
        // ECX holds the number of sets minus one.
        unsigned int cache_sets = ecx + 1;
        unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
        unsigned int cache_physical_line_partitions = ((ebx >>= 12) & 0x3FF) + 1;
        unsigned int cache_ways_of_associativity = ((ebx >>= 10) & 0x3FF) + 1;

        // Total cache size is the product of the four geometry fields
        size_t cache_total_size = cache_ways_of_associativity * cache_physical_line_partitions * cache_coherency_line_size * cache_sets;

        if (cache_type == 1 || cache_type == 3) { // data or unified cache
            data_caches[num_data_caches++] = cache_total_size;
        }

        printf(
            "Cache ID %d:\n"
            "- Level: %d\n"
            "- Type: %s\n"
            "- Sets: %d\n"
            "- System Coherency Line Size: %d bytes\n"
            "- Physical Line partitions: %d\n"
            "- Ways of associativity: %d\n"
            "- Total Size: %zu bytes (%zu kb)\n"
            "- Is fully associative: %s\n"
            "- Is Self Initializing: %s\n"
            "\n"
            , i
            , cache_level
            , cache_type_string
            , cache_sets
            , cache_coherency_line_size
            , cache_physical_line_partitions
            , cache_ways_of_associativity
            , cache_total_size, cache_total_size >> 10
            , cache_is_fully_associative ? "true" : "false"
            , cache_is_self_initializing ? "true" : "false"
        );
    }

    return num_data_caches;
}

/*
 * Sample single-read memory latencies over a buffer of lower_cache_size
 * bytes and accumulate them into a cycle-count histogram.
 *
 * attempts          number of one-byte loads to time
 * lower_cache_size  buffer size; sized larger than a given cache level so
 *                   that most reads miss that level and hit the next one
 * latencies         histogram: latencies[c] counts loads that took c cycles
 * max_latency       number of buckets in latencies; slower samples are
 *                   clamped into the last bucket
 *
 * Returns 0.  Aborts on open/mmap failure.
 */
int test_cache(size_t attempts, size_t lower_cache_size, int * latencies, size_t max_latency) {
    // NOTE(review): /dev/urandom is opened but never read -- the buffer
    // actually comes from the anonymous mapping below -- and fd is never
    // closed, so one descriptor leaks per call.
    int fd = open("/dev/urandom", O_RDONLY);
    if (fd < 0) {
        perror("open");
        abort();
    }
    char * random_data = mmap(
          NULL
        , lower_cache_size
        , PROT_READ | PROT_WRITE
        , MAP_PRIVATE | MAP_ANON // | MAP_POPULATE
        , -1
        , 0
        ); // anonymous zero-filled buffer (despite the variable's name)
    if (random_data == MAP_FAILED) {
        perror("mmap");
        abort();
    }

    // Touch one byte per page so every page is faulted in up front;
    // otherwise the first access to each page would time a page fault
    // instead of a cache/memory access.
    size_t i;
    for (i = 0; i < lower_cache_size; i += sysconf(_SC_PAGESIZE)) {
        random_data[i] = 1;
    }


    int64_t random_offset = 0;
    while (attempts--) {
        // use processor clock timer for exact measurement
        // Random walk over the buffer so the hardware prefetcher cannot
        // predict the next access.
        random_offset += rand();
        random_offset %= lower_cache_size;
        int32_t cycles_used, edx, temp1, temp2;
        // Timed sequence: rdtsc, one byte load, rdtsc, with mfence
        // serializing memory traffic around each timestamp.
        // NOTE(review): the closing sub/sbb pair subtracts the HIGH tsc
        // half first and applies the borrow to the LOW half -- backwards
        // for a 64-bit subtraction.  It works in practice only because the
        // high 32 bits of the TSC rarely change within one measurement.
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mov %4, %%al\n\t"  // the one-byte load being timed
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // subtract cycle count (high half)
            "sbb %3, %%eax"     // subtract cycle count (low half, with borrow)
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            : "m" (random_data[random_offset])
            );
        // printf("%d\n", cycles_used);
        // Clamp slow samples into the last bucket.  A negative cycles_used
        // converts to a huge value in this signed/unsigned comparison, so
        // it is also clamped rather than indexing out of bounds.
        if (cycles_used < max_latency)
            latencies[cycles_used]++;
        else 
            latencies[max_latency - 1]++;
    }

    munmap(random_data, lower_cache_size);

    return 0;
} 

/*
 * Driver: enumerate the caches, calibrate the timing overhead of the
 * empty rdtsc scaffold, then measure each cache level's access latency
 * as the 75th percentile of a per-level cycle histogram minus the
 * calibration baseline.
 */
int main() {
    // Sizes (bytes) of each data/unified cache, innermost level first.
    size_t cache_sizes[32];
    int num_data_caches = i386_cpuid_caches(cache_sizes);

    // Histogram of measured cycle counts: index = cycles, value = samples.
    int latencies[0x400];
    memset(latencies, 0, sizeof(latencies));

    int empty_cycles = 0;

    int i;
    int attempts = 1000000;

    // Calibration pass: run the fences and the two rdtsc reads with no
    // memory load in between, to find the fixed measurement overhead.
    for (i = 0; i < attempts; i++) { // measure how much overhead we have for counting cyscles
        int32_t cycles_used, edx, temp1, temp2;
        asm (
            "mfence\n\t"        // memory fence
            "rdtsc\n\t"         // get cpu cycle count
            "mov %%edx, %2\n\t"
            "mov %%eax, %3\n\t"
            "mfence\n\t"        // memory fence
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %2, %%edx\n\t" // subtract cycle count (high half)
            "sbb %3, %%eax"     // subtract cycle count (low half, with borrow)
            : "=a" (cycles_used)
            , "=d" (edx)
            , "=r" (temp1)
            , "=r" (temp2)
            :
            );
        // Clamp out-of-range (including negative, via unsigned conversion)
        // samples into the last histogram bucket.
        if (cycles_used < sizeof(latencies) / sizeof(*latencies))
            latencies[cycles_used]++;
        else 
            latencies[sizeof(latencies) / sizeof(*latencies) - 1]++;

    }

    // Baseline = 75th percentile of the overhead distribution.
    {
        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                empty_cycles = j;
                fprintf(stderr, "Empty counting takes %d cycles\n", empty_cycles);
                break;
            }
        }
    }

    // Per-cache measurement: a working set 4x the cache size forces most
    // accesses to miss that level and be served by the next one out.
    for (i = 0; i < num_data_caches; i++) {
        // BUG FIX: clear the histogram before each run.  Previously the
        // calibration samples and every earlier cache's samples remained
        // in `latencies`, contaminating each subsequent 75th-percentile
        // estimate with data from faster runs.
        memset(latencies, 0, sizeof(latencies));

        test_cache(attempts, cache_sizes[i] * 4, latencies, sizeof(latencies) / sizeof(*latencies));

        // Report the 75th-percentile latency minus the calibration overhead.
        int j;
        size_t sum = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum += latencies[j];
        }
        size_t sum2 = 0;
        for (j = 0; j < sizeof(latencies) / sizeof(*latencies); j++) {
            sum2 += latencies[j];
            if (sum2 >= sum * .75) {
                fprintf(stderr, "Cache ID %i has latency %d cycles\n", i, j - empty_cycles);
                break;
            }
        }

    }

    return 0;

}

Output on my Core2Duo:

Cache ID 0:
- Level: 1
- Type: Data Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 1:
- Level: 1
- Type: Instruction Cache
- Total Size: 32768 bytes (32 kb)

Cache ID 2:
- Level: 2
- Type: Unified Cache
- Total Size: 262144 bytes (256 kb)

Cache ID 3:
- Level: 3
- Type: Unified Cache
- Total Size: 3145728 bytes (3072 kb)

Empty counting takes 90 cycles
Cache ID 0 has latency 6 cycles
Cache ID 2 has latency 21 cycles
Cache ID 3 has latency 168 cycles

这篇关于测量缓存潜伏期的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆