间隔单词中的位的快速方法是什么? [英] What's a fast way to space-out bits within a word?
问题描述
我在64位寄存器的下部有一个32位值;顶部为0( X
表示具有信息的位,从LSB到MSB列出的位):
X X X ... X 0 0 0 0 ... 0
现在,我想用信息隔开"位,以便我拥有
X 0 X 0 X 0 ... X 0
(或者,如果您想将0放在首位,那么
0 X 0 X 0 X 0 ... X
也可以.)
最快的方法是什么?
与多CPU架构有关的答案可能会很好,但是与Intel x86_64和/或nVIDIA Pascal SM特定的东西最为相关.
这称为将32个0/1值打包到单个32位变量的位中最快的方法是什么?
一个通用的解决方案可能是
<代码> uint64_t bit_expand(uint64_t x)//00000000ABCDEFGH{x =((x& 0xFFFF0000)<< 32)|((x& 0x0000FFFF)<< 16);//ABCD0000EFGH0000x =(x& 0xFF000000FF000000)|((x& 0x00FF000000FF0000)>> 8);//AB00CD00EF00GH00x =(x& 0xF000F000F000F000)|((x& 0x0F000F000F000F00)>> 4);//A0B0C0D0E0F0G0H0x =(x& 0xC0C0C0C0C0C0C0C0)|((x& 0x3030303030303030)> 2);x =(x& 0xA0A0A0A0A0A0A0A0A)|((x& 0x5050505050505050)> 1);返回x;}
但是,常量生成在RISC架构上可能效率不高,因为64位立即数不能像x86那样存储在单个指令中.甚至在x86 输出组件是相当长.这是 Bit Twiddling Hacks
上描述的另一种可能的实现. 静态const unsigned int B [] = {0x55555555,0x33333333,0x0F0F0F0F,0x00FF00FF};静态const unsigned int S [] = {1,2,4,8};unsigned int x;//交织x和y的低16位,所以x的位无符号整数y;//处于偶数位置,并且位于y的奇数位;unsigned int z;//z获得最终的32位莫顿数.//x和y最初必须小于65536.x =(x |(x << S [3]))&B [3];x =(x |(x << S [2]))&B [2];x =(x |(x<< S [1]))&B [1];x =(x |(x<< S [0]))&B [0];y =(y |(y< S [3]))&B [3];y =(y |(y< S [2]))&B [2];y =(y |(y< S [1]))&B [1];y =(y |(y< S [0]))&B [0];z = x |(y<< 1);
也可以使用查找表
<代码> #define EXPAND4(a)(((((a)& 0x8)<<< 4)|((((a)& 0x4)<< 2)\|((((a)& 0x2)<< 1)|((((a)& 0x1))))const uint8_t LUT [16] = {EXPAND4(0),EXPAND4(1),EXPAND4(2),EXPAND4(3),EXPAND4(4),EXPAND4(5),EXPAND4(6),EXPAND4(7),EXPAND4(8),EXPAND4(9),EXPAND4(10),EXPAND4(11),EXPAND4(12),EXPAND4(13),EXPAND4(14),EXPAND4(15)};输出=((uint64_t)LUT [(x> 28)& 0xF]<< 56)|((uint64_t)LUT [(x> 24)& 0xF]<< 48)|(((uint64_t)LUT [(x> 20)& 0xF]<< 40)|((uint64_t)LUT [(x> 16)& 0xF]<< 32)|((uint64_t)LUT [(x> 12)& 0xF]<< 24)|((uint64_t)LUT [(x> 8)& 0xF]<< 16)|(((uint64_t)LUT [(x> 4)& 0xF]<< 8)|(((uint64_t)LUT [(x> 0)& 0xF]<< 0);
如有必要,可以增加查询表的大小
在带有 BMI2 的x86上, 另一种无需位沉积/扩展指令但具有快速乘法器的体系结构解决方案 它的工作方式是这样的 将比特拉近的魔术数是这样计算的 在输出组件可以看出此处.您可以更改编译器以查看其在各种架构上的完成情况 在 Bit Twiddling Hacks 页面上,还有另一种方法 更多解决方案可以在不使用BMI2的PDEP的便携式高效替代方案中找到? 如您所见,如果没有可用的比特存放指令,操作将非常复杂.如果您不这样做,则最好使用SIMD并行进行 I have a 32-bit value in the lower part of a 64-bit register; the top part is 0 ( Now, I want to "space out" the bits with information, so that I have (or if you'd rather put the 0's first, then is fine too.) What's a fast way to do that? A multi-CPU-architecture-relevant answer would be nice, but something specific to Intel x86_64 and/or nVIDIA Pascal SM's would be the most relevant. This is known as Morton number, which is a specific case of parallel expand which is in turn the reverse of compress right in the following questions One general solution might be However the constant generation might be inefficient on RISC architectures because the 64-bit immediate can't be stored in a single instruction like on x86. Even on x86 the output assembly is quite long. Here is another possible implementation as described on Bit Twiddling Hacks A lookup table can also be used The size of the lookup table may be increased if necessary On x86 with BMI2 there's hardware support with PDEP instruction which can be accessed via the following intrinsic
Another solution on architectures without bit deposit/expand instruction but with fast multipliers The way it works is like this The magic number for bringing the bits closer is calculated like this The output assembly can be seen here. You can change the compiler to see how it's done on various architectures There's also an alternative way on the Bit Twiddling Hacks page More solutions can be found in Portable efficient alternative to PDEP without using BMI2? Related: How to do bit striping on pixel data? As you can see, without the availability of a bit deposit instruction it'll be quite complex in terms of operations. If you do a not of bit striping like this then it'll be better to do in parallel using SIMD 这篇关于间隔单词中的位的快速方法是什么?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋! output = _pdep_u64(x,0xaaaaaaaaaaaaaaaaaaaaULL);
uint64_t spaceOut8bits(uint8_t b){uint64_t MAGIC = 0x8040201008040201;uint64_t MASK = 0x8080808080808080;uint64_t expand8bits = htobe64((((MAGIC * b)& MASK)>> 7);uint64_t spacedOutBits = expand8bits * 0x41041&0xAA000000AA000000;返回(spacedOutBits |(spacedOutBits<< 24))&0xFFFF000000000000;}uint64_t spaceOut64bits(uint64_t x){返回(spaceOut8bits(x> 24)>> 0)|(spaceOut8bits(x> 16)>> 16)|(spaceOut8bits(x> 8)>> 32)|(spaceOut8bits(x> 0)>> 48);}
abcdefgh
扩展到 a0000000 b0000000 c0000000 d0000000 e0000000 f0000000 g0000000 h0000000 并存储在 expand8bits
spacedOutBits
将包含 a0b0c0d0 00000000 00000000 00000000 e0f0g0h0 00000000 00000000 00000000 .我们将结果中的两个字节合并在一起 <代码> a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000×1000001000001000001───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────a0000000b0000000c0000000d0000000e0000000f0000000g0000000h000000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000+ 0000c0000000d0000000e0000000f0000000g0000000h0000000000000d0000000e0000000f0000000g0000000h0000000───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────a0b0c0d0b0c0d0e0c0d0e0f0d0e0f0g0e0f0g0h0f0g0h000g0h00000h0000000&101010100000000000000000000000001010101010000000000000000000000000000000──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────a0b0c0d0000000000000000000000000e0f0g0h0000000000000000000000000
z =((x * 0x0101010101010101ULL& 0x8040201008040201ULL)*0x0102040810204081ULL>49)&0x5555 |((y * 0x0101010101010101ULL& 0x8040201008040201ULL)*0x0102040810204081ULL>48)&0xAAAA;
X
denotes a bit with information, bits listed from LSB to MSB):X X X ... X 0 0 0 0 ... 0
X 0 X 0 X 0 ... X 0
0 X 0 X 0 X 0 ... X
uint64_t bit_expand(uint64_t x) // 00000000ABCDEFGH
{
x = ((x & 0xFFFF0000) << 32) | ((x & 0x0000FFFF) << 16); // ABCD0000EFGH0000
x = (x & 0xFF000000FF000000) | ((x & 0x00FF000000FF0000) >> 8); // AB00CD00EF00GH00
x = (x & 0xF000F000F000F000) | ((x & 0x0F000F000F000F00) >> 4); // A0B0C0D0E0F0G0H0
x = (x & 0xC0C0C0C0C0C0C0C0) | ((x & 0x3030303030303030) >> 2);
x = (x & 0xA0A0A0A0A0A0A0A0) | ((x & 0x5050505050505050) >> 1);
return x;
}
static const unsigned int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
static const unsigned int S[] = {1, 2, 4, 8};
unsigned int x; // Interleave lower 16 bits of x and y, so the bits of x
unsigned int y; // are in the even positions and bits from y in the odd;
unsigned int z; // z gets the resulting 32-bit Morton Number.
// x and y must initially be less than 65536.
x = (x | (x << S[3])) & B[3];
x = (x | (x << S[2])) & B[2];
x = (x | (x << S[1])) & B[1];
x = (x | (x << S[0])) & B[0];
y = (y | (y << S[3])) & B[3];
y = (y | (y << S[2])) & B[2];
y = (y | (y << S[1])) & B[1];
y = (y | (y << S[0])) & B[0];
z = x | (y << 1);
#define EXPAND4(a) ((((a) & 0x8) << 4) | (((a) & 0x4) << 2) \
| (((a) & 0x2) << 1) | (((a) & 0x1)))
const uint8_t LUT[16] = {
EXPAND4( 0), EXPAND4( 1), EXPAND4( 2), EXPAND4( 3),
EXPAND4( 4), EXPAND4( 5), EXPAND4( 6), EXPAND4( 7),
EXPAND4( 8), EXPAND4( 9), EXPAND4(10), EXPAND4(11),
EXPAND4(12), EXPAND4(13), EXPAND4(14), EXPAND4(15)
};
output = ((uint64_t)LUT[(x >> 28) & 0xF] << 56) | ((uint64_t)LUT[(x >> 24) & 0xF] << 48)
| ((uint64_t)LUT[(x >> 20) & 0xF] << 40) | ((uint64_t)LUT[(x >> 16) & 0xF] << 32)
| ((uint64_t)LUT[(x >> 12) & 0xF] << 24) | ((uint64_t)LUT[(x >> 8) & 0xF] << 16)
| ((uint64_t)LUT[(x >> 4) & 0xF] << 8) | ((uint64_t)LUT[(x >> 0) & 0xF] << 0);
output = _pdep_u64(x, 0xaaaaaaaaaaaaaaaaULL);
uint64_t spaceOut8bits(uint8_t b)
{
uint64_t MAGIC = 0x8040201008040201;
uint64_t MASK = 0x8080808080808080;
uint64_t expand8bits = htobe64(((MAGIC*b) & MASK) >> 7);
uint64_t spacedOutBits = expand8bits*0x41041 & 0xAA000000AA000000;
return (spacedOutBits | (spacedOutBits << 24)) & 0xFFFF000000000000;
}
uint64_t spaceOut64bits(uint64_t x)
{
return (spaceOut8bits(x >> 24) >> 0)
| (spaceOut8bits(x >> 16) >> 16)
| (spaceOut8bits(x >> 8) >> 32)
| (spaceOut8bits(x >> 0) >> 48);
}
abcdefgh
to a0000000 b0000000 c0000000 d0000000 e0000000 f0000000 g0000000 h0000000 and store in expand8bits
spacedOutBits
will contain a0b0c0d0 00000000 00000000 00000000 e0f0g0h0 00000000 00000000 00000000. We'll merge the two bytes in the result together a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
× 1000001000001000001
────────────────────────────────────────────────────────────────
a0000000b0000000c0000000d0000000e0000000f0000000g0000000h0000000
00b0000000c0000000d0000000e0000000f0000000g0000000h0000000
+ 0000c0000000d0000000e0000000f0000000g0000000h0000000
000000d0000000e0000000f0000000g0000000h0000000
────────────────────────────────────────────────────────────────
a0b0c0d0b0c0d0e0c0d0e0f0d0e0f0g0e0f0g0h0f0g0h000g0h00000h0000000
& 1010101000000000000000000000000010101010000000000000000000000000
────────────────────────────────────────────────────────────────
a0b0c0d0000000000000000000000000e0f0g0h0000000000000000000000000
z = ((x * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 49) & 0x5555 |
((y * 0x0101010101010101ULL & 0x8040201008040201ULL) *
0x0102040810204081ULL >> 48) & 0xAAAA;