您的系统缓存大小估计？ [英] Cache size estimation on your system?

查看：200 发布时间：2016/8/18 14:35:36 c performance caching cpu-cache

本文介绍了您的系统缓存大小估计？的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我从这个链接（https://gist.github.com/jiewmeng/3787223）得到了这个程序。我一直在网上搜索资料，希望更好地了解处理器缓存（L1 和 L2）。我希望能写一个程序，用来估算我的新笔记本电脑上 L1 和 L2 缓存的大小。（仅出于学习目的，我知道可以直接查看规格说明。）

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define KB 1024
#define MB 1024 * 1024

int main() {
    unsigned int steps = 256 * 1024 * 1024;
    static int arr[4 * 1024 * 1024];
    int lengthMod;
    unsigned int i;
    double timeTaken;
    clock_t start;
    int sizes[] = {
        1 * KB, 4 * KB, 8 * KB, 16 * KB, 32 * KB, 64 * KB, 128 * KB, 256 * KB,
        512 * KB, 1 * MB, 1.5 * MB, 2 * MB, 2.5 * MB, 3 * MB, 3.5 * MB, 4 * MB
    };
    int results[sizeof(sizes)/sizeof(int)];
    int s;

    /*for each size to test for ... */
    for (s = 0; s < sizeof(sizes)/sizeof(int); s++)
    {
            lengthMod = sizes[s] - 1;
            start = clock();
            for (i = 0; i < steps; i++)
            {
                arr[(i * 16) & lengthMod] *= 10;
                arr[(i * 16) & lengthMod] /= 10;
            }

            timeTaken = (double)(clock() - start)/CLOCKS_PER_SEC;
            printf("%d, %.8f \n", sizes[s] / 1024, timeTaken);
    }

    return 0;
}

该程序在我的机器上的输出如下。我该如何解读这些数字？这个程序告诉了我什么？

1, 1.07000000
4, 1.04000000
8, 1.06000000
16, 1.13000000
32, 1.14000000
64, 1.17000000
128, 1.20000000
256, 1.21000000
512, 1.19000000
1024, 1.23000000
1536, 1.23000000
2048, 1.46000000
2560, 1.21000000
3072, 1.45000000
3584, 1.47000000
4096, 1.94000000

解决方案

你需要直接访问内存
我指的并不是 DMA 传输。内存当然必须由 CPU 访问（否则测量的就不是缓存了），但访问要尽可能直接……因此在 Windows/Linux 上测量结果可能不太准确，因为服务和其他进程会在运行期间干扰缓存。请多次测量并取平均值以获得更好的结果（或取最快的一次，或对结果进行过滤）。为获得最佳精度，可使用 DOS 和汇编，例如：
```
rep + movsb,movsw,movsd
rep + stosb,stosw,stosd
 
```
这样测量的就是内存传输本身，而不是像你的代码那样测量了别的东西！

测量原始传输时间并绘制图表
- X 轴为传输块大小
- Y 轴为传输速度
传输速率相同的区域对应于相应的缓存层级

[Edit1] 我找不到以前为此写的源代码，所以刚刚用 C++ 为 Windows 临时写了一份：

时间测量：

//---------------------------------------------------------------------------
double performance_Tms=-1.0,    // perioda citaca [ms]
       performance_tms= 0.0;    // zmerany cas [ms]
//---------------------------------------------------------------------------
void tbeg()
    {
    LARGE_INTEGER i;
    if (performance_Tms<=0.0) { QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart); }
    QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart);
    }
//---------------------------------------------------------------------------
double tend()
    {
    LARGE_INTEGER i;
    QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart)-performance_tms; performance_tms*=performance_Tms;
    return performance_tms;
    }
//---------------------------------------------------------------------------

基准（32位应用程序）：

  // ------------------------------------ ---------------------------------------
DWORD尺寸[] = //用于传输块大小
    {
      1所述;小于10，第2版;小于10，第3版;小于10，4℃;小于10，5℃小于10，第6版;小于10，7所述;小于10，8示小于10，9下;小于10，
     10下;小于10，11下;小于10，12下;小于10，13下;小于10，14下;小于10，15℃;小于10，16下;小于10，17下;小于10，第18版;小于10，
     19下;小于10，20℃;小于10，21下;小于10，22℃;小于10，23℃;小于10，24下;小于10，25℃;小于10，26下;小于10，27下;小于10，
     28和;小于10，29下;小于10，30℃，小于10，31下;小于10，32下;小于10，48下;小于10，64下;小于10，80℃小于10，96下;小于10，
    112<<10,128<<10,192<<10,256<<10,320<<10,384<<10,448<<10,512<<10, 1所述;＆小于20，
      2';＆小于20，第3版;＆小于20，4℃;＆小于20，5';＆小于20，第6版;＆小于20，7所述;＆小于20，8所述;＆小于20，9所述;＆小于20，10下; ＆小于20，
     11下;＆小于20，12下;＆小于20，13下;＆小于20，14下;＆小于20，15℃;＆小于20，16下;＆小于20，17下;＆小于20，第18版;＆小于20，19下; ＆小于20，
     20℃;＆小于20，21下;＆小于20，22℃;＆小于20，23℃;＆小于20，24下;＆小于20，25℃;＆小于20，26下;＆小于20，27下;＆小于20，28和; ＆小于20，
     29下;＆小于20，30℃;＆小于20，31下;＆小于20，32下;＆小于20，
    };
const int的N = sizeof的（大小）GT;＆GT; 2; //使用大小号
双pmovsd [N]; //测量传输速率代表MOVSD [MB /秒]
双pstosd [N]; //测量传输速率代表STOSD [MB /秒]
// ------------------------------------------------ ---------------------------
无效的措施（）
    {
    INT I;
    BYTE * DAT; //指针使用的内存
    DWORD ADR，SIZ，NUM;对于ASM //局部变量
    双T，T0;
    HANDLE HND; //进程句柄    //使优先级变化（差异巨大）
    ＃定义measure_priority    //使临界区（没有区别）
//＃定义measure_lock    对于（i = 0; I＆LT; N;我++）pmovsd [I] = 0.0;
    对于（i = 0; I＆LT; N;我++）pstosd [I] = 0.0;
    DAT =新的字节[大小[N-1] +4]; //最后一个DWORD +4字节（应该是3，但我喜欢4更多）
    如果（DAT == NULL）回报;
    #IFDEF measure_priority
    HND = GetCurrentProcess（）;如果（HND！= NULL）{SetPriorityClass（HND，REALTIME_PRIORITY_CLASS）; CloseHandle的（HND）; }
    睡眠（200）; //等待更改生效
    ＃万一
    #IFDEF measure_lock
    CRITICAL_SECTION的锁; //锁手柄
    InitializeCriticalSectionAndSpinCount（安培;锁定，0x00000400时）;
    EnterCriticalSection的（安培;锁定）;
    ＃万一
    ADR =（DWORD）（DAT）;
    对于（i = 0; I＆LT; N;我++）
        {
        SIZ =尺寸[I] // SIZ =实际块大小
        NUM =（δ;＆小于20）/ SIZ; //计算N（次重复测量）
        如果（NUM 4;）NUM = 4;
        SIZ＆GT;＆GT; = 2; //因为32位传输的大小/ 4
        //措施开销
        tbeg（）; //启动时间meassurement
        ASM {
            推ESI
            推EDI
            推ECX
            推EBX
            推EAX
            MOV EBX，NUM
            MOV人，0
    LOOP0：MOV ESI，ADR
            MOV EDI，ADR
            MOV ECX，SIZ
//代表MOVSD // ES，DS已经用C ++设置
//代表STOSD // ES已被设置C ++
            十二月EBX
            JNZ LOOP0
            流行EAX
            流行EBX
            流行ECX
            流行EDI
            流行ESI
            }
        T0 =趋向（）; //停止时间meassurement
        //测量1
        tbeg（）; //启动时间meassurement
        ASM {
            推ESI
            推EDI
            推ECX
            推EBX
            推EAX
            MOV EBX，NUM
            MOV人，0
    循环1：MOV ESI，ADR
            MOV EDI，ADR
            MOV ECX，SIZ
            代表MOVSD // ES，DS已经用C ++设置
//代表STOSD // ES已被设置C ++
            十二月EBX
            JNZ循环1
            流行EAX
            流行EBX
            流行ECX
            流行EDI
            流行ESI
            }
        T =倾向于（）; //停止时间meassurement
        T-T0 =;如果（T＆LT; 1E-6）T = 1E-6; //开销消除和避免被零除
        T =双（SIZ＆LT; 2）*双（NUM）/吨; //字节/ MS
        pmovsd [I] = T /（1.024 * 1024.0）; //兆字节/秒
        //测量2
        tbeg（）; //启动时间meassurement
        ASM {
            推ESI
            推EDI
            推ECX
            推EBX
            推EAX
            MOV EBX，NUM
            MOV人，0
    循环2：MOV ESI，ADR
            MOV EDI，ADR
            MOV ECX，SIZ
//代表MOVSD // ES，DS已经用C ++设置
            代表STOSD // ES已被设置C ++
            十二月EBX
            JNZ循环2
            流行EAX
            流行EBX
            流行ECX
            流行EDI
            流行ESI
            }
        T =倾向于（）; //停止时间meassurement
        T-T0 =;如果（T＆LT; 1E-6）T = 1E-6; //开销消除和避免被零除
        T =双（SIZ＆LT; 2）*双（NUM）/吨; //字节/ MS
        pstosd [I] = T /（1.024 * 1024.0）; //兆字节/秒
        }
    #IFDEF measure_lock
    LeaveCriticalSection（安培;锁定）;
    DeleteCriticalSection（安培;锁定）;
    ＃万一
    #IFDEF measure_priority
    HND = GetCurrentProcess（）;如果（HND！= NULL）{SetPriorityClass（HND，NORMAL_PRIORITY_CLASS）; CloseHandle的（HND）; }
    ＃万一
    删除DAT;
    }
// ------------------------------------------------ ---------------------------

其中数组 pmovsd[] 和 pstosd[] 保存测得的 32 位传输速率 [MB/秒]。你可以通过启用或注释掉 measure 函数开头的两个 #define 来配置代码。

图形输出：

为了最大限度地提高精度，可以把进程优先级类设为最高。也可以创建一个最高优先级的测量线程（我试过，但实际上反而把事情搞乱了），并为其添加临界区，使测试不那么频繁地被操作系统打断（使用与不使用线程没有明显区别）。如果想使用字节传输，请注意它只使用 16 位寄存器，因此需要添加循环和地址迭代。

PS。

如果在笔记本电脑上尝试，应先让 CPU 热起来，以确保是在 CPU/内存的最高速度下测量。因此不要使用 Sleep。测量前先跑一些无意义的循环即可，但至少应运行几秒钟。也可以通过测量 CPU 频率来同步：在频率上升期间持续循环，待其饱和后停止……

汇编指令 RDTSC 最适合做这件事（但要注意，它的含义在新架构上略有变化）。

如果不是在 Windows 下，请把函数 tbeg、tend 换成你的操作系统上的等价实现。

精度 进一步改进

在终于解决了 VCL 影响测量精度的问题之后（我是因为这个问题才发现它的，相关链接：http://stackoverflow.com/q/21516244/2521214），为了提高精度，可以在基准测试之前做以下几件事：

设置进程的优先级类实时

将进程亲和性设置为单个 CPU
这样在多核系统上只测量单个 CPU

刷新数据缓存和指令缓存

例如：

    // before mem benchmark
    DWORD process_affinity_mask=0;
    DWORD system_affinity_mask =0;
    HANDLE hnd=GetCurrentProcess();
    if (hnd!=NULL)
        {
        // priority
        SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS);
        // affinity
        GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
        process_affinity_mask=1;
        SetProcessAffinityMask(hnd,process_affinity_mask);
        GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
        }
    // flush CACHEs
    for (DWORD i=0;i<sizes[N-1];i+=7)
        {
        dat[i]+=i;
        dat[i]*=i;
        dat[i]&=i;
        }

    // after mem benchmark
    if (hnd!=NULL)
        {
        SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS);
        SetProcessAffinityMask(hnd,system_affinity_mask);
        }

因此，更准确的测量是这样的：

I got this program from this link (https://gist.github.com/jiewmeng/3787223). I have been searching the web to gain a better understanding of processor caches (L1 and L2). I want to be able to write a program that lets me estimate the size of the L1 and L2 caches on my new laptop (just for learning purposes; I know I could check the spec).

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Size units used to build the sizes[] table in main().
 * NOTE(review): MB expands without parentheses, so `1.5 * MB` becomes
 * `1.5 * 1024 * 1024`. That is harmless here because MB is only ever
 * multiplied, but it would misbehave inside e.g. a division. */
#define KB 1024
#define MB 1024 * 1024

/*
 * Cache-size probe: for each entry of sizes[], repeatedly read-modify-write
 * elements of a static int array whose index is masked with (size - 1), and
 * print the size divided by 1024 together with the elapsed CPU time in
 * seconds as reported by clock().
 *
 * NOTE(review): the table entries are applied as masks on an *element* index
 * of an int array, so the span of memory touched is sizeof(int) times the
 * listed value, while the printout labels it as value/1024.
 * NOTE(review): `sizes[s] - 1` is an all-ones bit mask only when the size is
 * a power of two. For 1.5/2.5/3/3.5 * MB the mask has holes, so the index
 * does not sweep a contiguous range of that length.
 */
int main() {
    /* number of loop iterations timed for every tested size */
    unsigned int steps = 256 * 1024 * 1024;
    /* working buffer of 4 * 1024 * 1024 ints; the largest mask (4 * MB - 1)
     * stays inside it */
    static int arr[4 * 1024 * 1024];
    int lengthMod;
    unsigned int i;
    double timeTaken;
    clock_t start;
    /* sizes to test; the fractional entries are converted to int by the
     * initialisation */
    int sizes[] = {
        1 * KB, 4 * KB, 8 * KB, 16 * KB, 32 * KB, 64 * KB, 128 * KB, 256 * KB,
        512 * KB, 1 * MB, 1.5 * MB, 2 * MB, 2.5 * MB, 3 * MB, 3.5 * MB, 4 * MB
    };
    /* NOTE(review): never read or written below */
    int results[sizeof(sizes)/sizeof(int)];
    int s;

    /* Time one pass of `steps` accesses for every entry of sizes[].
     * NOTE(review): `s` is a signed int compared against a size_t value. */
    for (s = 0; s < sizeof(sizes)/sizeof(int); s++)
    {
            /* mask that wraps the index into the tested range */
            lengthMod = sizes[s] - 1;
            start = clock();
            for (i = 0; i < steps; i++)
            {
                /* consecutive iterations land 16 ints apart (presumably one
                 * 64-byte cache line with 4-byte int -- confirm for the
                 * target); the multiply/divide pair only exists to force a
                 * read and a write of the element */
                arr[(i * 16) & lengthMod] *= 10;
                arr[(i * 16) & lengthMod] /= 10;
            }

            /* clock() ticks -> seconds */
            timeTaken = (double)(clock() - start)/CLOCKS_PER_SEC;
            printf("%d, %.8f \n", sizes[s] / 1024, timeTaken);
    }

    return 0;
}

The output of the program on my machine is as follows. How do I interpret the numbers? What does this program tell me?

1, 1.07000000 
4, 1.04000000 
8, 1.06000000 
16, 1.13000000 
32, 1.14000000 
64, 1.17000000 
128, 1.20000000 
256, 1.21000000 
512, 1.19000000 
1024, 1.23000000 
1536, 1.23000000 
2048, 1.46000000 
2560, 1.21000000 
3072, 1.45000000 
3584, 1.47000000 
4096, 1.94000000

解决方案

you need direct access to memory

I do not mean DMA transfer by this. Memory must of course be accessed by the CPU (otherwise you are not measuring the caches), but as directly as possible ... so measurements will probably not be very accurate on Windows/Linux, because services and other processes can disturb the caches at runtime. Measure many times and average for better results (or use the fastest time, or filter the results). For best accuracy use DOS and asm, for example:
```
rep + movsb,movsw,movsd 
rep + stosb,stosw,stosd
```
so you measure the memory transfer and not something else, as your code does!
Measure the raw transfer times and plot a graph:
- x axis is transfer block size
- y axis is transfer speed
Zones with the same transfer rate correspond to the same cache level.

[Edit1] I could not find my old source code for this, so I put something together right now in C++ for Windows:

Time measurement:

//---------------------------------------------------------------------------
double performance_Tms=-1.0,    // perioda citaca [ms]
       performance_tms= 0.0;    // zmerany cas [ms]
//---------------------------------------------------------------------------
void tbeg()
    {
    LARGE_INTEGER i;
    if (performance_Tms<=0.0) { QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart); }
    QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart);
    }
//---------------------------------------------------------------------------
double tend()
    {
    LARGE_INTEGER i;
    QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart)-performance_tms; performance_tms*=performance_Tms;
    return performance_tms;
    }
//---------------------------------------------------------------------------

Benchmark (32bit app):

//---------------------------------------------------------------------------
// Transfer block sizes in bytes, in ascending order.
DWORD sizes[]=
    {
    // 1 KiB .. 32 KiB in 1 KiB steps
      1<<10,  2<<10,  3<<10,  4<<10,  5<<10,  6<<10,  7<<10,  8<<10,
      9<<10, 10<<10, 11<<10, 12<<10, 13<<10, 14<<10, 15<<10, 16<<10,
     17<<10, 18<<10, 19<<10, 20<<10, 21<<10, 22<<10, 23<<10, 24<<10,
     25<<10, 26<<10, 27<<10, 28<<10, 29<<10, 30<<10, 31<<10, 32<<10,
    // 48 KiB .. 128 KiB in 16 KiB steps
     48<<10, 64<<10, 80<<10, 96<<10,112<<10,128<<10,
    // 192 KiB .. 512 KiB in 64 KiB steps
    192<<10,256<<10,320<<10,384<<10,448<<10,512<<10,
    // 1 MiB .. 32 MiB in 1 MiB steps
      1<<20,  2<<20,  3<<20,  4<<20,  5<<20,  6<<20,  7<<20,  8<<20,
      9<<20, 10<<20, 11<<20, 12<<20, 13<<20, 14<<20, 15<<20, 16<<20,
     17<<20, 18<<20, 19<<20, 20<<20, 21<<20, 22<<20, 23<<20, 24<<20,
     25<<20, 26<<20, 27<<20, 28<<20, 29<<20, 30<<20, 31<<20, 32<<20,
    };
// Number of entries in sizes[].
const int N=sizeof(sizes)/sizeof(sizes[0]);
// Results, one slot per entry of sizes[], in MByte/sec.
double pmovsd[N];               // transfer rate measured with rep MOVSD
double pstosd[N];               // transfer rate measured with rep STOSD
//---------------------------------------------------------------------------
// Measures the memory transfer rate of `rep movsd` and `rep stosd` for every
// block size in sizes[] and stores the results, in MByte/s, in pmovsd[] and
// pstosd[] (same index as sizes[]).
//
// For each block size the function times three inline-asm loops that share
// the same register setup and repeat count:
//   1. setup only (no transfer)           -> loop overhead t0
//   2. setup + rep movsd (block onto itself, source == destination)
//   3. setup + rep stosd (fill of the block)
// The overhead is subtracted from 2 and 3 before converting to a rate.
//
// 32-bit only: the buffer address is stored in a DWORD and the asm uses the
// 32-bit registers. The `asm { }` blocks are compiler-specific inline assembly.
// NOTE(review): rep movsd/stosd advance upwards only if the direction flag is
// clear on entry -- assumed, not set here.
void measure()
    {
    int i;
    BYTE *dat;                              // pointer to used memory
    DWORD adr,siz,num;                      // local variables for asm
    double t,t0;
    HANDLE hnd;                             // process handle

    // enable priority change (huge difference)
    #define measure_priority

    // enable critical sections (no difference)
//  #define measure_lock

    // clear previous results
    for (i=0;i<N;i++) pmovsd[i]=0.0;
    for (i=0;i<N;i++) pstosd[i]=0.0;
    // one buffer big enough for the largest block (last entry of sizes[])
    dat=new BYTE[sizes[N-1]+4];             // last DWORD +4 Bytes (should be 3 but i like 4 more)
    // only effective on compilers whose `new` returns NULL on failure;
    // a throwing `new` never reaches this line with NULL
    if (dat==NULL) return;
    #ifdef measure_priority
    // raise the priority class of the current process for the measurement
    hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS); CloseHandle(hnd); }
    Sleep(200);                             // wait for the change to take effect
    #endif
    #ifdef measure_lock
    CRITICAL_SECTION lock;                  // lock handle
    InitializeCriticalSectionAndSpinCount(&lock,0x00000400);
    EnterCriticalSection(&lock);
    #endif
    adr=(DWORD)(dat);                       // buffer address as a 32-bit value for the asm blocks
    for (i=0;i<N;i++)
        {
        siz=sizes[i];                       // siz = actual block size [Bytes]
        num=(8<<20)/siz;                    // repeat count: about 8 MiB transferred in total ...
        if (num<4) num=4;                   // ... but never fewer than 4 repetitions
        siz>>=2;                            // size / 4 because of 32bit transfer
        // measure overhead: same loop as below with the transfer instructions left out
        tbeg();                             // start time measurement
        asm {
            push esi
            push edi
            push ecx
            push ebx
            push eax
            mov ebx,num
            mov al,0
    loop0:  mov esi,adr
            mov edi,adr
            mov ecx,siz
//          rep movsd                       // es,ds already set by C++
//          rep stosd                       // es already set by C++
            dec ebx
            jnz loop0
            pop eax
            pop ebx
            pop ecx
            pop edi
            pop esi
            }
        t0=tend();                          // stop time measurement
        // measurement 1: rep movsd, source and destination are the same block
        tbeg();                             // start time measurement
        asm {
            push esi
            push edi
            push ecx
            push ebx
            push eax
            mov ebx,num
            mov al,0
    loop1:  mov esi,adr
            mov edi,adr
            mov ecx,siz
            rep movsd                       // es,ds already set by C++
//          rep stosd                       // es already set by C++
            dec ebx
            jnz loop1
            pop eax
            pop ebx
            pop ecx
            pop edi
            pop esi
            }
        t=tend();                           // stop time measurement
        t-=t0; if (t<1e-6) t=1e-6;          // remove overhead and avoid division by zero
        t=double(siz<<2)*double(num)/t;     // Byte/ms
        pmovsd[i]=t/(1.024*1024.0);         // Byte/ms -> MByte/s
        // measurement 2: rep stosd (only al is zeroed above, the rest of eax is whatever it was)
        tbeg();                             // start time measurement
        asm {
            push esi
            push edi
            push ecx
            push ebx
            push eax
            mov ebx,num
            mov al,0
    loop2:  mov esi,adr
            mov edi,adr
            mov ecx,siz
//          rep movsd                       // es,ds already set by C++
            rep stosd                       // es already set by C++
            dec ebx
            jnz loop2
            pop eax
            pop ebx
            pop ecx
            pop edi
            pop esi
            }
        t=tend();                           // stop time measurement
        t-=t0; if (t<1e-6) t=1e-6;          // remove overhead and avoid division by zero
        t=double(siz<<2)*double(num)/t;     // Byte/ms
        pstosd[i]=t/(1.024*1024.0);         // Byte/ms -> MByte/s
        }
    #ifdef measure_lock
    LeaveCriticalSection(&lock);
    DeleteCriticalSection(&lock);
    #endif
    #ifdef measure_priority
    // back to normal priority
    hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS); CloseHandle(hnd); }
    #endif
    delete[] dat;                           // allocated with new[], so it must be released with delete[]
    }
//---------------------------------------------------------------------------

The arrays pmovsd[] and pstosd[] hold the measured 32-bit transfer rates [MByte/sec]. You can configure the code by enabling or commenting out the two defines at the start of the measure function.

Graphical Output:

To maximize accuracy you can change the process priority class to maximum. You can also create a measurement thread with maximum priority (I tried it, but it actually messed things up) and add a critical section to it so that the test is not interrupted by the OS as often (no visible difference with and without threads). If you want to use Byte transfers, take into account that they use only 16-bit registers, so you need to add a loop and address iteration.

PS.

If you try this on a notebook, you should warm up the CPU first to be sure that you measure at top CPU/memory speed. So no Sleeps. Some dummy loops before the measurement will do it, but they should run for at least a few seconds. You can also synchronize this by measuring the CPU frequency and looping while it is rising; stop after it saturates ...

asm instruction RDTSC is best for this (but beware its meaning has slightly changed with new architectures).

If you are not under Windows, replace the functions tbeg and tend with your OS equivalents.

[edit2] further improvements of accuracy

Well, after finally solving the problem of VCL affecting measurement accuracy (which I discovered thanks to this question; more about it here), you can improve accuracy by doing the following prior to the benchmark:

- set the process priority class to realtime
- set the process affinity to a single CPU, so you measure just a single CPU on a multi-core machine
- flush the data and instruction caches

For example:

    // before mem benchmark
    DWORD process_affinity_mask=0;
    DWORD system_affinity_mask =0;
    HANDLE hnd=GetCurrentProcess();
    if (hnd!=NULL)
        {
        // priority
        SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS);
        // affinity
        GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
        process_affinity_mask=1;
        SetProcessAffinityMask(hnd,process_affinity_mask);
        GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
        }
    // flush CACHEs
    for (DWORD i=0;i<sizes[N-1];i+=7)
        {
        dat[i]+=i;
        dat[i]*=i;
        dat[i]&=i;
        }

    // after mem benchmark
    if (hnd!=NULL)
        {
        SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS);
        SetProcessAffinityMask(hnd,system_affinity_mask);
        }

So the more accurate measurement looks like this:

这篇关于您的系统缓存大小估计？的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

您的系统缓存大小估计？ [英] Cache size estimation on your system?

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录关闭

您的系统缓存大小估计？ [英] Cache size estimation on your system?

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录 关闭

登录关闭