How to optimise XOR function for large buffer manipulation


Question

I am investigating how to optimise XORing two data buffers together on a 32-bit Intel processor. To get something working I have written a simple function along the lines of:

void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
    while(size--)
    {
        *outBuf++ = *inBuf1++ ^ *inBuf2++;
    }
}


I really need the function to be more efficient. The options I've thought of so far are:

XORing __int64 data instead
Looking at using BitBlt
Using SSE2

Can anyone suggest other alternatives, or provide any clues as to which of the above is likely to be better?

Recommended answer

Have a look at this article:
Introduction to SSE Programming[^]

You would simply use the __m128i type together with the _mm_xor_si128 intrinsic:
http://msdn.microsoft.com/en-us/library/fzt08www(v=vs.80).aspx[^]
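
For illustration, here is a minimal sketch of an XOR loop built on _mm_xor_si128 (this example is mine, not from the linked article; it assumes size is a multiple of 16 and uses unaligned loads/stores so the buffers need not be 16-byte aligned):

#include <emmintrin.h>  // SSE2 intrinsics

// Sketch: XOR two buffers 16 bytes at a time with _mm_xor_si128.
// Assumes size is a multiple of 16 bytes.
void XOR_SSE2(const char* inBuf1, const char* inBuf2, char* outBuf, unsigned int size)
{
    for(unsigned int i = 0; i < size; i += 16)
    {
        __m128i a = _mm_loadu_si128((const __m128i*)(inBuf1 + i));
        __m128i b = _mm_loadu_si128((const __m128i*)(inBuf2 + i));
        _mm_storeu_si128((__m128i*)(outBuf + i), _mm_xor_si128(a, b));
    }
}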

Other good resource:
http://bmagic.sourceforge.net/bmsse2opt.html[^]

Good luck!


A little example to evaluate:
// Headers needed to build this sample with MSVC on Windows:
#include <windows.h>    // QueryPerformanceCounter/Frequency, MulDiv, LARGE_INTEGER
#include <tchar.h>      // _tmain, _tprintf, __T/__TEXT
#include <conio.h>      // _getch
#include <stdio.h>
#include <stdlib.h>     // malloc, free, srand, rand
#include <xmmintrin.h>  // __m128, _mm_xor_ps

// Convert a QueryPerformanceCounter value t into milliseconds, given the
// counter frequency f.
double FreqTime(const LARGE_INTEGER& t,const LARGE_INTEGER& f)
{
  typedef struct { static double tod(const LARGE_INTEGER& li){ return (4294967296.0*li.HighPart+li.LowPart); } }_;
  return 1e3*_::tod(t)/_::tod(f);
}

// Reference implementation: plain byte-at-a-time XOR.
void XOR(char* inBuf1, char* inBuf2, char*outBuf, unsigned int size)
{
    while(size--)
    {
        *outBuf++ = *inBuf1++ ^ *inBuf2++;
    }
}

// Generic version: XOR the buffers one TI-sized word at a time, then hand any
// leftover tail bytes to the char instantiation.
template <class TI>
void tXOR(char* i1, char* i2, char* o, unsigned int cb)
{
  unsigned int  i;
  TI*            ii1 = (TI*)i1;
  TI*            ii2 = (TI*)i2;
  TI*            oo  = (TI*)o;
  unsigned int  nn = cb/sizeof(TI);  // number of whole TI words
  unsigned int  dd = sizeof(TI)*nn;  // bytes covered by the word loop
  for(i=0;i<nn;i++) oo[i]=ii1[i]^ii2[i];
  if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);  // XOR the remaining bytes
}

// SSE specialization: XOR 16 bytes per iteration with _mm_xor_ps. Note that
// dereferencing __m128* produces aligned accesses, so this assumes the
// buffers are 16-byte aligned (use _mm_loadu_ps/_mm_storeu_ps otherwise).
template <>
void tXOR<__m128>(char* i1, char* i2, char* o, unsigned int cb)
{
  unsigned int  i;
  __m128*        ii1 = (__m128*)i1;
  __m128*        ii2 = (__m128*)i2;
  __m128*        oo  = (__m128*)o;
  unsigned int  nn = cb/sizeof(__m128);
  unsigned int  dd = sizeof(__m128)*nn;
  for(i=0;i<nn;i++) oo[i]=_mm_xor_ps(ii1[i],ii2[i]);
  if(dd<cb) tXOR<char>(i1+dd,i2+dd,o+dd,cb-dd);
}

int _tmain(int argc, _TCHAR* argv[])
{

  LARGE_INTEGER freq = {0,0};
  LARGE_INTEGER t0 = {0,0};
  LARGE_INTEGER t1 = {0,0};

  QueryPerformanceFrequency(&freq);

  enum{ BUFFSIZE=1<<20, LOOPS=1000, };
  unsigned int  i;
  char*          in1 = (char*)malloc(BUFFSIZE);
  char*          in2 = (char*)malloc(BUFFSIZE);
  char*          out = (char*)malloc(BUFFSIZE);
  
  QueryPerformanceCounter(&t0);
  srand(t0.LowPart);
  // Fill the input buffers. MulDiv(rand(),1,RAND_MAX) only ever yields 0 or 1,
  // but the data pattern has no effect on the timings below.
  for(i=0;i<BUFFSIZE;i++)
  {
    in1[i] = MulDiv(rand(),1,RAND_MAX);
    in2[i] = MulDiv(rand(),1,RAND_MAX);
  }

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) XOR(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("XOR               "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<char>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<char>        "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<short>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<short>       "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<unsigned int>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<unsigned int>"),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__int64>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__int64>     "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  QueryPerformanceCounter(&t0);
  for(i=0;i<LOOPS;i++) tXOR<__m128>(in1,in2,out,BUFFSIZE);
  QueryPerformanceCounter(&t1);
  _tprintf(__T("%s(%i,%i) = %lfms\r\n"),__TEXT("tXOR<__m128>      "),LOOPS,BUFFSIZE,FreqTime(t1,freq)-FreqTime(t0,freq));

  free(in1);
  free(in2);
  free(out);

  _getch();
  return 0;
}


Output:

XOR               (1000,1048576) = 1447.370089ms
tXOR<char>        (1000,1048576) = 993.742449ms
tXOR<short>       (1000,1048576) = 511.465385ms
tXOR<unsigned int>(1000,1048576) = 334.088394ms
tXOR<__int64>     (1000,1048576) = 232.502586ms
tXOR<__m128>      (1000,1048576) = 201.321703ms




Tested on an i3 CPU.
Regards.



I think that if you XOR more data in one pass, you will get a more efficient result: it decreases both the XOR operation count (in clock cycles) and the memory access count. On a 32-bit Intel processor you can process 64 bits (8 bytes) at a time using the MMX registers, if MMX is supported. If you use SSE2 instructions, the data width is 128 bits. These cases should be profiled, however, before deciding which is the most efficient. SIMD provides useful instructions that operate on vectors, and as far as I understand your case is plain buffer manipulation. I can help further if you need help while implementing.

Code such as the one below should be more efficient than the version above for a very large buffer.

void XOR(char* inBuf1, char* inBuf2, char*outBuf, int size)
{
	int size8 = size / sizeof(unsigned __int64); // number of whole 8-byte words
	if(0 < size8)
	{
		unsigned __int64 * p1 = (unsigned __int64 *)inBuf1;
		unsigned __int64 * p2 = (unsigned __int64 *)inBuf2;
		unsigned __int64 * po = (unsigned __int64 *)outBuf;

		while(size8--)
		{
			*po++ = *p1++ ^ *p2++;
		}
	}

	int size1 = size % sizeof(unsigned __int64); // leftover tail bytes
	if(0 < size1)
	{
		const int done = size - size1;
		inBuf1 += done;
		inBuf2 += done;
		outBuf += done;

		while(size1--)
		{
			*outBuf++ = *inBuf1++ ^ *inBuf2++;
		}
	}
}
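
Along the same lines, the 64-bit MMX route mentioned above could look like the sketch below (my own illustration, not part of the original answer; MSVC only supports these intrinsics in 32-bit x86 builds):

#include <mmintrin.h>  // MMX intrinsics (32-bit x86 builds only with MSVC)

void XOR_MMX(char* inBuf1, char* inBuf2, char* outBuf, int size)
{
	// XOR the bulk of the buffers 8 bytes at a time through the MMX registers.
	int size8 = size / 8;
	__m64* p1 = (__m64*)inBuf1;
	__m64* p2 = (__m64*)inBuf2;
	__m64* po = (__m64*)outBuf;
	for(int i = 0; i < size8; i++)
	{
		po[i] = _mm_xor_si64(p1[i], p2[i]);
	}
	_mm_empty();  // clear the MMX state so later x87 floating-point code is safe

	// XOR any remaining tail bytes one at a time.
	for(int i = 8 * size8; i < size; i++)
	{
		outBuf[i] = inBuf1[i] ^ inBuf2[i];
	}
}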

