如何使用clflush? [英] How to use clflush?

查看:151
本文介绍了如何使用clflush?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想测量clflush后访问表条目和访问另一个条目之间的时间差。
下面是我的尝试,但上述两种操作之间几乎测不出任何时间差(惩罚)。该表的长度为256,每个条目为8位。我怀疑我的clflush无法正常工作。我正在gcc中使用-O3标志进行编译。

I want to measure the time difference between accessing a table entry and accessing another entry after a clflush. Below you will find my attempt; I get almost no penalty for the above two operations. The table is of length 256 with 8 bits in each entry. I suspect my clflush is not working properly. I am compiling with the -O3 flag in gcc.

            #include <stdio.h>
            #include <stdlib.h>
            #include <stdint.h>
            // Element count of an array expression: its total size divided by the
            // size of one element. Not referenced by the code shown below.
            #define ARRAYSIZE(arr) (sizeof(arr)/sizeof(arr[0]))

            // Number of table lookups performed per timed run; also the length of
            // the scratch buffers that receive the looked-up values.
            #define REPEAT 10000

            // 256-entry byte lookup table that func() reads at random indices and
            // that flushCache() evicts.
            // NOTE(review): no alignment is requested, so with 64-byte cache lines
            // the table may straddle five lines rather than four -- confirm.
            unsigned char table[256]={103,198,105,115,81,255,74,236,41,205,186,171,242,251,227,70,124,194,84,248,27,232,231,141,118,90,46,99,51,159,201,154,102,50,13,183,49,88,163,90,37,93,5,23,88,233,94,212,171,178,205,198,155,180,84,17,14,130,116,65,33,61,220,135,112,233,62,161,65,225,252,103,62,1,126,151,234,220,107,150,143,56,92,42,236,176,59,251,50,175,60,84,236,24,219,92,2,26,254,67,251,250,170,58,251,41,209,230,5,60,124,148,117,216,190,97,137,249,92,187,168,153,15,149,177,235,241,179,5,239,247,0,233,161,58,229,202,11,203,208,72,71,100,189,31,35,30,168,28,123,100,197,20,115,90,197,94,75,121,99,59,112,100,36,17,158,9,220,170,212,172,242,27,16,175,59,51,205,227,80,72,71,21,92,187,111,34,25,186,155,125,245,11,225,26,28,127,35,248,41,248,164,27,19,181,202,78,232,152,50,56,224,121,77,61,52,188,95,78,119,250,203,108,5,172,134,33,43,170,26,85,162,190,112,181,115,59,4,92,211,54,148,179,175,226,240,228,158,79,50,21,73,253,130,78,169};



            /*
             * Flush the cache line containing address `p` (x86 CLFLUSH).
             *
             * `static inline` gives the function a real definition in this
             * translation unit, so the program still links when the compiler
             * decides not to inline the call (e.g. without optimisation).
             * `__asm__` is used instead of `asm` because the latter is not a
             * keyword in strict ISO modes such as -std=c11.
             * The "memory" clobber stops the compiler from moving ordinary loads
             * and stores across the flush.
             */
            static inline void clflush(volatile void *p)
            {
                __asm__ __volatile__ ("clflush (%0)" : : "r"(p) : "memory");
            }

            /*
             * Read the CPU time-stamp counter, preceded by CPUID.
             *
             * CPUID is a serializing instruction, so earlier instructions have
             * completed before RDTSC samples the counter.
             *
             * Returns the 64-bit counter assembled from EDX:EAX.
             */
            static inline uint64_t rdtsc(void)
            {
                uint32_t lo = 0;    /* EAX on entry selects CPUID leaf 0 */
                uint32_t hi;

                /* CPUID overwrites EAX..EDX; EBX/ECX are not outputs, so they are
                 * declared clobbered. "memory" keeps the compiler from moving
                 * loads and stores across the measurement point. */
                __asm__ __volatile__ ("cpuid\n\trdtsc"
                                      : "+a" (lo), "=d" (hi)
                                      :
                                      : "ebx", "ecx", "memory");
                return (uint64_t)lo | ((uint64_t)hi << 32);
            }

            /*
             * Fill a[0..REPEAT-1] with bytes read from `table` at pseudo-random
             * indices (rand() % 256).
             *
             * a: destination buffer with room for at least REPEAT ints.
             * Returns the number of entries written (always REPEAT). The original
             * was declared int but fell off the end without returning a value.
             */
            static inline int func(int *a)
            {
                int i;

                for (i = 0; i < REPEAT; i++) {
                    a[i] = (int)table[rand() % 256];
                }
                return REPEAT;
            }
            /*
             * Evict a 256-byte table beginning at `start` from the cache.
             *
             * Assumes 64-byte cache lines (the stride the original code used).
             * One flush is issued per 64-byte step, plus one for the last byte
             * of the table: when `start` is not line-aligned the table spans five
             * lines, and the fifth is the one holding byte 255 -- not the line
             * at start+256, which is one past the table.
             */
            void flushCache(unsigned char *start)
            {
                enum { TABLE_BYTES = 256, LINE_BYTES = 64 };
                int off;

                for (off = 0; off < TABLE_BYTES; off += LINE_BYTES) {
                    clflush(start + off);
                }
                /* Covers the tail line of an unaligned table; redundant but
                 * harmless when the table is line-aligned. */
                clflush(start + TABLE_BYTES - 1);
            }


            /*
             * Time one call to func() (REPEAT random table lookups) with rdtsc()
             * and print the elapsed tick count.
             */
            static inline void test(void)
            {
                uint64_t start, end;
                int temp[REPEAT];
                unsigned checksum = 0;
                volatile unsigned sink;
                int i;

                start = rdtsc();

                func(temp);

                end = rdtsc();

                /* Consume every looked-up value through a volatile store. Without
                 * an observable use the compiler is free to discard the stores
                 * into temp, and with them the table loads being measured. */
                for (i = 0; i < REPEAT; i++) {
                    checksum += (unsigned)temp[i];
                }
                sink = checksum;
                (void)sink;

                /* uint64_t is not `long` on every target; print via a cast. */
                printf("%llu ticks\n", (unsigned long long)(end - start));
            }

            /*
             * Time one call to func() with rdtsc(), print the elapsed tick count,
             * and evict `table` from the cache so the next run starts cold.
             *
             * The flush is issued once, after the timer has stopped, so the cost
             * of the clflush instructions is not counted in the measurement.
             */
            static inline void testflush(void)
            {
                uint64_t start, end;
                int temp[REPEAT];
                unsigned checksum = 0;
                volatile unsigned sink;
                int i;

                start = rdtsc();

                func(temp);

                end = rdtsc();

                flushCache(table);  /* outside the timed region */

                /* Consume every looked-up value through a volatile store. Without
                 * an observable use the compiler is free to discard the stores
                 * into temp, and with them the table loads being measured. */
                for (i = 0; i < REPEAT; i++) {
                    checksum += (unsigned)temp[i];
                }
                sink = checksum;
                (void)sink;

                /* uint64_t is not `long` on every target; print via a cast. */
                printf("%llu ticks\n", (unsigned long long)(end - start));
            }



            /*
             * Driver: a plain timed run, then the run that also flushes the
             * table, then a second plain run. Command-line arguments are ignored.
             */
            int main(int ac, char **av)
            {
                (void)ac;
                (void)av;

                test();
                fputs("Tables in cache!\n", stdout);

                testflush();
                fputs("Tables evicted from cache.\n", stdout);

                test();

                return EXIT_SUCCESS;
            }

update:我知道对表的访问方式可能存在一些问题。这是另一段代码,它驱逐的是单个变量而不是整个表。使用clflush()时,这段代码的时钟周期数显著增加。这是否意味着clflush()工作正常,而时间的增加是由于从内存访问该变量所致?

update: I understand there might be some problem due to table access. Here is another code that evicts a single variable instead of the whole table. This one shows a significant increase in clock cycles when using clflush(). Does it mean clflush() is working properly and the increased time is due to accessing the variable from memory?

            #include <stdint.h>
            #include <stdio.h>
            #define REPEAT 100000
            /*
             * Flush the cache line containing address `p` (x86 CLFLUSH).
             *
             * `static inline` gives the function a real definition in this
             * translation unit, so the program still links when the compiler
             * decides not to inline the call (e.g. without optimisation).
             * `__asm__` is used instead of `asm` because the latter is not a
             * keyword in strict ISO modes such as -std=c11.
             * The "memory" clobber stops the compiler from moving ordinary loads
             * and stores across the flush.
             */
            static inline void clflush(volatile void *p)
            {
                __asm__ __volatile__ ("clflush (%0)" : : "r"(p) : "memory");
            }

            /*
             * Read the CPU time-stamp counter with a bare RDTSC.
             *
             * No serializing instruction is issued, so the CPU may execute RDTSC
             * out of order relative to neighbouring instructions.
             *
             * Returns the 64-bit counter assembled from EDX:EAX.
             */
            static inline uint64_t rdtsc(void)
            {
                /* RDTSC writes 32 bits to each of EAX and EDX; use fixed-width
                 * outputs rather than `unsigned long`, whose size varies. */
                uint32_t lo, hi;

                __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
                return (uint64_t)lo | ((uint64_t)hi << 32);
            }

            // Variable whose read is timed by test()/testflush() and whose cache
            // line testflush() evicts; volatile so each read of `i` in the source
            // is performed as an actual access.
            volatile int i;

            /*
             * Time REPEAT reads of the global `i` (each stored, plus one, into a
             * volatile local) and print the accumulated tick count.
             */
            static inline void test(void)
            {
                uint64_t start, end;
                uint64_t total = 0;     /* sum of per-iteration tick deltas */
                volatile int j;         /* volatile so the store is not elided */
                long int rep;

                for (rep = 0; rep < REPEAT; rep++) {
                    start = rdtsc();
                    j = i + 1;
                    end = rdtsc();
                    total += end - start;
                }
                (void)j;

                /* uint64_t is not `unsigned long` on every target; print via a
                 * cast so the format always matches the argument. */
                printf("took %llu ticks\n", (unsigned long long)total);
            }

            /*
             * Same measurement as test(), but the cache line holding `i` is
             * flushed after every timed read, so each subsequent read starts
             * with that line evicted. The flush itself is outside the timed
             * start/end window.
             */
            static inline void testflush(void)
            {
                uint64_t start, end;
                uint64_t total = 0;     /* sum of per-iteration tick deltas */
                volatile int j;         /* volatile so the store is not elided */
                long int rep;

                for (rep = 0; rep < REPEAT; rep++) {
                    start = rdtsc();
                    j = i + 1;
                    end = rdtsc();
                    clflush(&i);        /* evict before the next iteration */
                    total += end - start;
                }
                (void)j;

                /* uint64_t is not `unsigned long` on every target; print via a
                 * cast so the format always matches the argument. */
                printf("took %llu ticks\n", (unsigned long long)total);
            }


            /*
             * Driver: initialise `i`, then run test(), testflush() and test()
             * again, printing a separator line before each run.
             * Command-line arguments are ignored.
             */
            int main(int ac, char **av)
            {
                (void)ac;
                (void)av;

                i = 5;

                fputs("------------------------------------------\n", stdout);
                test();

                fputs("------------------------------------------\n", stdout);
                testflush();

                fputs("------------------------------------------\n", stdout);
                test();

                return 0;
            }


推荐答案

我在代码中看到的一些问题。

Some issues I see with the code.

您是在调用 clflush 之后才结束 testflush 的计时器。因此,您也把处理这些指令所需的周期计入了时间。我认为这并非您的本意。

You end the timer for testflush after calling clflush. Therefore you are timing the cycles necessary for processing these instructions, too. I don't think that is intended.

在您的测试函数中,您有一个带有10000次迭代的循环。每次迭代可能会引用一个新的缓存行,但是 table 中只有4个缓存行。因此,至少有9996次迭代无论如何都不会引发缓存未命中。

In your test function you have a loop with 10000 iterations. Each iteration may invoke a reference to one new cache line, but there are only 4 cache lines in table. So at least 9996 iterations invoke no cache miss anyway.

因此,您的计时时间是10000次 rand()%256 再加上4次缓存加载。即使缓存加载需要几百个周期, rand()%256 的10000次迭代仍将使它黯然失色。

Thus you are timing 10000 times rand()%256 plus 4 cache loads. Even if cache loads take a few hundred cycles, 10000 iterations of rand()%256 will still overshadow that.

生成的这些10000个整数也必须回写。我不确定L1-> L2缓存带宽是否会是一个限制因素,但可能是限制因素。

These 10000 integers generated must also be written back. I am not sure whether L1->L2 cache bandwidth would be a limiting factor, but it might be.

您还需要将测试运行数千次左右并取平均值,否则样本方差太大。

You also need to run the test a few thousand times or so and average, the sample variance is way too high otherwise.

此外,CPU也有可能在您请求这些缓存行之前,通过推测执行再次将它们预取回来。CPU被允许这样做,但我不知道当前的CPU有多聪明。

Then it might also be possible that the cpu prefetches the cache lines again by speculation before you request them. It is allowed to do so, but I don't know how clever current cpus are.

这篇关于如何使用clflush?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆