与AVX2和范围保存的按位类型转换 [英] Bitwise type conversion with AVX2 and range preservation

查看:849
本文介绍了与AVX2和范围保存的按位类型转换的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想将signed char的向量转换为unsigned char的向量。
我想保留每个类型的值范围。

I want to convert a vector of signed char into a vector of unsigned char. I want to preserve the value range for each type.

我的意思是:signed char 的取值范围是 -128 到 +127,而 unsigned char 元素的取值范围是 0 到 255。

I mean the value range of a signed char is -128 to +127, whereas the value range of an unsigned char element is 0 to 255.

不使用内部函数(intrinsics)的话,我大致可以这样做:

Without intrinsics I can do it roughly like this:

#include <iostream>

// Scalar reference: remaps every signed char from [-128, 127] onto [0, 255].
int main(int argc,char* argv[])
{

typedef signed char schar;
typedef unsigned char uchar;

// 32 sample inputs, including the extreme value -128.
schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};

uchar b[32] = {0};

    // ~(0x7F ^ x) & 0xFF flips only the sign bit of the low byte (same as x ^ 0x80),
    // which is the value x + 128 reduced modulo 256.
    for(int i=0;i<32;i++)
        b[i] = 0xFF & ~(0x7F ^ a[i]);

    return 0;

}

所以使用AVX2我写了以下程序:

So using AVX2 I wrote the following program :

#include <immintrin.h>
#include <iostream>

// First AVX2 attempt: apply the bit trick to the bytes, then widen to 16 bits and pack back.
int main(int argc,char* argv[])
{
    schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};

     uchar b[32] = {0};

    // NOTE(review): the streaming load expects a 32-byte aligned address; a[] carries
    // no alignment specifier here.
    __m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
    __m256i _b;
    __m256i _cst1 = _mm256_set1_epi8(0x7F);
    __m256i _cst2 = _mm256_set1_epi8(0xFF);

    _a = _mm256_xor_si256(_a,_cst1);
    // andnot computes (~first) & second. With _cst2 (all bits set) as the first
    // operand this is 0 & _a, so _a is all zeros from here on.
    _a = _mm256_andnot_si256(_cst2,_a);

// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16: interleaving with zero puts each byte in the
// high half of a 16-bit element; the arithmetic shift by 8 then sign-extends it.
    _b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
    _a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);

    // convert from epi16 -> epu8 (packus saturates: negative values become 0).
    _b = _mm256_packus_epi16(_b,_a);

// Streaming store; same 32-byte alignment expectation as the load.
_mm256_stream_si256(reinterpret_cast<__m256i*>(b),_b);

return 0;
}

当我显示变量 b 时,它完全是空的。
我还检查了以下情况:

When I display the variable b, it is completely empty. I also checked the following situations:

   #include <immintrin.h>
    #include <iostream>

    // Second attempt: widen and pack first, then apply the xor/andnot to the packed bytes.
    int main(int argc,char* argv[])

{
    schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};

     uchar b[32] = {0};

    // NOTE(review): the streaming load expects a 32-byte aligned address; a[] carries
    // no alignment specifier here.
    __m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
    __m256i _b;
    __m256i _cst1 = _mm256_set1_epi8(0x7F);
    __m256i _cst2 = _mm256_set1_epi8(0xFF);


// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16: interleaving with zero puts each byte in the
// high half of a 16-bit element; the arithmetic shift by 8 then sign-extends it.
    _b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
    _a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);

    // convert from epi16 -> epu8. packus saturates, so every negative input is
    // already clamped to 0 before the xor below is applied.
    _b = _mm256_packus_epi16(_b,_a);

_b = _mm256_xor_si256(_b,_cst1);
// andnot computes (~first) & second; with _cst2 all ones this yields zero.
_b = _mm256_andnot_si256(_cst2,_b);


_mm256_stream_si256(reinterpret_cast<__m256i*>(b),_b);

return 0;
}

和:

 #include <immintrin.h>
    #include <iostream>

    // Third attempt: widen to 16 bits, apply the xor/andnot to both halves, then pack.
    int main(int argc,char* argv[])

{
    schar a[]={-1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,-128,19,20,21,22,23,24,25,26,27,28,29,30,31,32};

     uchar b[32] = {0};

    // NOTE(review): the streaming load expects a 32-byte aligned address; a[] carries
    // no alignment specifier here.
    __m256i _a = _mm256_stream_load_si256(reinterpret_cast<const __m256i*>(a));
    __m256i _b;
    __m256i _cst1 = _mm256_set1_epi8(0x7F);
    __m256i _cst2 = _mm256_set1_epi8(0xFF);


// The way I do the conversion is inspired by an algorithm from OpenCV.
// Conversion from epi8 -> epi16: interleaving with zero puts each byte in the
// high half of a 16-bit element; the arithmetic shift by 8 then sign-extends it.
_b = _mm256_srai_epi16(_mm256_unpacklo_epi8(_mm256_setzero_si256(),_a),8);
_a = _mm256_srai_epi16(_mm256_unpackhi_epi8(_mm256_setzero_si256(),_a),8);

// The constants were built per byte, so the xor touches BOTH bytes of every
// 16-bit element; andnot with an all-ones first operand then zeroes the vector.
_a = _mm256_xor_si256(_a,_cst1);
_a = _mm256_andnot_si256(_cst2,_a);

_b = _mm256_xor_si256(_b,_cst1);
_b = _mm256_andnot_si256(_cst2,_b);

_b = _mm256_packus_epi16(_b,_a);

// NOTE(review): this casts the VALUE of b[0] (an unsigned char) to a pointer,
// not the address of the array as the other versions do.
_mm256_stream_si256(reinterpret_cast<__m256i*>(b[0]),_b);

return 0;
}

我的调查表明,问题的一部分与 andnot 操作有关,
但我不知道为什么。

My investigation shows that part of the issue is related to the andnot operation, but I can't figure out why.

变量b应该包含以下序列:
[127, 126, 125, 132, 133, 134, 121, 120, 137, 138, 117, 140, 141, 142, 143, 144, 145, 0, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160]。

The variable b should contain the following sequence : [127, 126, 125, 132, 133, 134, 121, 120, 137, 138, 117, 140, 141, 142, 143, 144, 145, 0, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160].

感谢您提供任何帮助。

推荐答案

是的,andnot 看起来确实有问题。由于 _cst2 的值被设置为 0xFF,此操作会把您的 _b 向量与零做按位与。我想您把参数的顺序弄混了:被取反的是第一个参数。请查看参考文档。

Yeah, the "andnot" definitely looks sketchy. Since _cst2 values are set to 0xFF, this operation will AND your _b vector with zero. I think you mixed up the order of arguments. It's the first argument that gets inverted. See the reference.

我也不明白其余那些转换的用意。您只需要这个:

I don't understand the rest of the guff with conversions etc either. You just need this:

// Byte-wise ~(a ^ 0x7F). andnot inverts its FIRST operand, so with an all-ones
// second operand it acts as a plain bitwise NOT of _b.
__m256i _a, _b;
_a = _mm256_stream_load_si256( reinterpret_cast<__m256i*>(a) );
_b = _mm256_xor_si256( _a, _mm256_set1_epi8( 0x7f ) );
_b = _mm256_andnot_si256( _b, _mm256_set1_epi8( 0xff ) );
_mm256_stream_si256( reinterpret_cast<__m256i*>(b), _b );

另一个解决方案是直接加上128,但我不确定这种情况下溢出会有什么影响:

An alternative solution is to just add 128, but I'm not certain of the implications of overflow in this case:

// Alternative: add 0x80 to every byte. _mm256_add_epi8 wraps around on overflow
// (it does not saturate), so the result is the same as flipping each sign bit.
__m256i _a, _b;
_a = _mm256_stream_load_si256( reinterpret_cast<__m256i*>(a) );
_b = _mm256_add_epi8( _a, _mm256_set1_epi8( 0x80 ) );
_mm256_stream_si256( reinterpret_cast<__m256i*>(b), _b );

最后一件重要的事情是,你的 a 和 b 数组必须具有32字节对齐。如果您使用C++11,可以使用 alignas:

One final important thing is that your a and b arrays MUST have 32-byte alignment. If you are using C++11 you can use alignas:

// Both buffers are declared with 32-byte alignment, which the streaming
// load/store intrinsics used above expect.
alignas(32) signed char a[32] = { -1,-2,-3,4,5,6,-7,-8,9,10,-11,12,13,14,15,16,17,
                                 -128,19,20,21,22,23,24,25,26,27,28,29,30,31,32 };
alignas(32) unsigned char b[32] = {0};

否则,您需要使用非对齐的加载和存储指令,即 _mm256_loadu_si256 和 _mm256_storeu_si256。但它们不具有与流指令相同的非时间性(non-temporal)缓存属性。

Otherwise you will need to use non-aligned load and store instructions, i.e. _mm256_loadu_si256 and _mm256_storeu_si256. But those don't have the same non-temporal cache properties as the stream instructions.

这篇关于与AVX2和范围保存的按位类型转换的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆