与AVX2和范围保存的按位类型转换 [英] bitwise type conversion with AVX2 and range preservation
问题描述
我想将signed char的向量转换为unsigned char的向量。
我想保留每个类型的值范围。
I want to convert a vector of signed char into a vector of unsigned char. I want to preserve the value range for each type.
我的意思是signed char的值范围是-128到+127,而unsigned char元素的值范围是0到255。
I mean the value range of signed char is -128 to +127, whereas the value range of an unsigned char element is 0 to 255.
不使用intrinsics(内建函数)的话,我大致可以这样做:
Without intrinsics I can do it roughly like this:
#include <iostream>
// Scalar reference: maps each signed char in [-128, 127] onto an unsigned char
// in [0, 255] using only bitwise operations.
int main(int argc,char* argv[])
{
    using schar = signed char;
    using uchar = unsigned char;

    schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
    uchar b[32] = {0};

    // Walk the input once, writing one converted byte per input element.
    uchar* out = b;
    for (const schar value : a) {
        // XOR with 0x7F, invert, then keep only the low 8 bits of the int result.
        const int flipped = 0x7F ^ value;
        *out++ = 0xFF & ~flipped;
    }
    return 0;
}
所以使用AVX2我写了以下程序:
So using AVX2 I wrote the following program :
#include <immintrin.h>
#include <iostream>
// First AVX2 attempt: XOR / andnot on the loaded bytes, then widen to 16-bit
// lanes and pack back to unsigned 8-bit before storing into b.
// NOTE(review): schar and uchar are not declared inside this snippet.
int main(int argc,char* argv[])
{
schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
uchar b[32] = {0};
// NOTE(review): the stream load/store intrinsics need 32-byte-aligned
// addresses; a and b carry no alignment specifier here.
__m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
__m256i _b;
__m256i _cst1 = _mm256_set1_epi8(0x7F);
__m256i _cst2 = _mm256_set1_epi8(0xFF);
_a = _mm256_xor_si256(_a,_cst1);
// andnot inverts its FIRST operand: this is (~_cst2) & _a. _cst2 is 0xFF in
// every byte, so ~_cst2 is zero and _a is cleared at this point.
_a = _mm256_andnot_si256(_cst2,_a);
// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16
_b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
_a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);
// Convert from epi16 -> epu8.
_b = _mm256_packus_epi16(_b,_a);
_mm256_stream_si256(reinterpret_cast<__m256i*>(b),_b);
return 0;
}
当我显示变量b时,它完全是空的。
我还检查了以下情况:
When I display the variable b, it is completely empty. I also checked the following situations:
#include <immintrin.h>
#include <iostream>
// Second AVX2 attempt: widen to 16-bit lanes and pack back to unsigned 8-bit
// first, then apply XOR / andnot on the packed bytes before storing into b.
// NOTE(review): schar and uchar are not declared inside this snippet.
int main(int argc,char* argv[])
{
schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
uchar b[32] = {0};
// NOTE(review): the stream load/store intrinsics need 32-byte-aligned
// addresses; a and b carry no alignment specifier here.
__m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
__m256i _b;
__m256i _cst1 = _mm256_set1_epi8(0x7F);
__m256i _cst2 = _mm256_set1_epi8(0xFF);
// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16
_b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
_a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);
// Convert from epi16 -> epu8.
// NOTE(review): packus is presumably an unsigned-saturating pack, in which
// case negative inputs are already clamped before the XOR below — confirm.
_b = _mm256_packus_epi16(_b,_a);
_b = _mm256_xor_si256(_b,_cst1);
// andnot inverts its FIRST operand: this is (~_cst2) & _b. _cst2 is 0xFF in
// every byte, so ~_cst2 is zero and _b is cleared at this point.
_b = _mm256_andnot_si256(_cst2,_b);
_mm256_stream_si256(reinterpret_cast<__m256i*>(b),_b);
return 0;
}
和:
#include <immintrin.h>
#include <iostream>
// Third AVX2 attempt: widen to 16-bit lanes, apply XOR / andnot on both
// widened halves, then pack back to unsigned 8-bit and store.
// NOTE(review): schar and uchar are not declared inside this snippet.
int main(int argc,char* argv[])
{
schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};
uchar b[32] = {0};
// NOTE(review): the stream load/store intrinsics need 32-byte-aligned
// addresses; a and b carry no alignment specifier here.
__m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
__m256i _b;
__m256i _cst1 = _mm256_set1_epi8(0x7F);
__m256i _cst2 = _mm256_set1_epi8(0xFF);
// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16
_b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
_a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);
// NOTE(review): _cst1/_cst2 were built per byte with set1_epi8, but _a and _b
// hold 16-bit lanes here, so the constants also touch the upper byte of each lane.
_a = _mm256_xor_si256(_a,_cst1);
// andnot inverts its FIRST operand: (~_cst2) & x. _cst2 is 0xFF in every byte,
// so ~_cst2 is zero and both _a and _b are cleared by these two andnot calls.
_a = _mm256_andnot_si256(_cst2,_a);
_b = _mm256_xor_si256(_b,_cst1);
_b = _mm256_andnot_si256(_cst2,_b);
_b = _mm256_packus_epi16(_b,_a);
// NOTE(review): b[0] is an unsigned char value, not an address; the other
// attempts pass b itself — looks like a typo, confirm.
_mm256_stream_si256(reinterpret_cast<__m256i*>(b[0]),_b);
return 0;
}
我的调查显示,问题的一部分与andnot操作有关。
但我不知道为什么。
My investigation shows that part of the issue is related to the andnot operation, but I can't figure out why.
变量b应该包含以下序列:
[127,126,125,132,133,134,121,120,137,138,117,140,141,142,143,144,145,0,147,148,149,150,151,152,153,154,155,156,157,158,159,160]。
The variable b should contain the following sequence : [127, 126, 125, 132, 133, 134, 121, 120, 137, 138, 117, 140, 141, 142, 143, 144, 145, 0, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160].
感谢您提供任何帮助。
推荐答案
是的,andnot看起来确实有问题。由于 _cst2
的值被设置为 0xFF
,此操作会把您的 _b
向量与零做按位与。我想你混淆了参数的顺序:被取反的是第一个参数。请查看参考文档。
Yeah, the "andnot" definitely looks sketchy. Since _cst2
values are set to 0xFF
, this operation will AND your _b
vector with zero. I think you mixed up the order of arguments. It's the first argument that gets inverted. See the reference.
我不明白其余的转换等。您只需要这个:
I don't understand the rest of the guff with conversions etc either. You just need this:
__m256i _a, _b;
_a = _mm256_stream_load_si256( reinterpret_cast<__m256i*>(a) );
_b = _mm256_xor_si256( _a, _mm256_set1_epi8( 0x7f ) );
_b = _mm256_andnot_si256( _b, _mm256_set1_epi8( 0xff ) );
_mm256_stream_si256( reinterpret_cast<__m256i*>(b), _b );
另一个解决方案是直接加上128,但我不确定这种情况下溢出会带来什么影响:
An alternative solution is to just add 128, but I'm not certain of the implications of overflow in this case:
__m256i _a, _b;
_a = _mm256_stream_load_si256( reinterpret_cast<__m256i*>(a) );
_b = _mm256_add_epi8( _a, _mm256_set1_epi8( 0x80 ) );
_mm256_stream_si256( reinterpret_cast<__m256i*>(b), _b );
最后一件重要的事情是你的 a
和 b
数组必须具有32字节对齐。如果您使用C ++ 11,您可以使用 alignas
:
One final important thing is that your a
and b
arrays MUST have 32-byte alignment. If you are using C++11 you can use alignas
:
// Input and output buffers, each placed on a 32-byte boundary so they can be
// used with the aligned (stream) load and store intrinsics.
alignas(32) signed char a[32] = {
     -1,   -2,  -3,   4,   5,   6,  -7,  -8,
      9,   10, -11,  12,  13,  14,  15,  16,
     17, -128,  19,  20,  21,  22,  23,  24,
     25,   26,  27,  28,  29,  30,  31,  32
};
alignas(32) unsigned char b[32] = {};
否则,您需要使用非对齐的加载和存储指令,即 _mm256_loadu_si256
和 _mm256_storeu_si256
。但是那些不具有与流指令相同的非时间缓存属性。
Otherwise you will need to use non-aligned load and store instructions, i.e. _mm256_loadu_si256
and _mm256_storeu_si256
. But those don't have the same non-temporal cache properties as the stream instructions.
这篇关于与AVX2和范围保存的按位类型转换的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!