在软件中实现SSE 4.2的CRC32C [英] Implementing SSE 4.2's CRC32C in software

查看：1726 发布时间：2016/10/13 10:15:11 c++ polynomial-math crc32

本文介绍了在软件中实现SSE 4.2的CRC32C的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

所以我有一个结合CRC32C校验和的设计，以确保数据没有被损坏。我决定使用CRC32C，因为如果软件运行的计算机支持SSE 4.2，我可以同时拥有软件版本和硬件加速版本。

我要去英特尔开发人员手册（vol 2A），似乎提供了 crc32 指令的算法。但是，我没有运气。英特尔的开发者指南如下：

  BIT_REFLECT32：DEST [31-0] = SRC [0-31] 
 MOD2：多项式除法模数2的余数
 
 TEMP1 [31-0] < -  BIT_REFLECT（SRC [31-0]）
 TEMP2 [31-0] < -  BIT_REFLECT [31-0]）
 TEMP3 [63-0]<  -  TEMP1 [31-0]<< 32 
 TEMP4 [63-0]<  -  TEMP2 [31-0]<< 32 
 TEMP5 [63-0]<  -  TEMP3 [63-0] XOR TEMP4 [63-0] 
 TEMP6 [31-0]<  -  TEMP5 [63-0] MOD2 0x11EDC6F41 
 DEST [31-0] < -  BIT_REFLECT（TEMP6 [31-0]）

，据我所知，我已经做了一切正确的开始 TEMP6 的行，但我想我可能是误解多项式除法，或实现不正确。如果我的理解正确， 1/1 mod 2 = 1 ， 0/1 mod 2 = 0 除以零是不确定的。

我不明白是如何使用64位和33位操作数的二进制除法将如何工作。如果 SRC 为 0x00000000 ，且 DEST 为 0xFFFFFFFF ， TEMP5 [63-32] 将为所有设置位，而 TEMP5 [31-0] 如果我使用 TEMP5 的位，作为分子，因为多项式 11EDC6F41 只有33位长（因此将其转换为64位无符号整数会留下前30位

但是，如果我使用多项式作为分子，则<$ c $的底部32位c> TEMP5 未设置，导致其中除以零，结果的前30位将为零，因为分子的前30位为零，因为 0/1 mod 2 = 0 。

我误解了这是如何工作的吗？只是平淡地缺少一些东西？或者英特尔在他们的文档中遗漏了一些关键的步骤？

我去英特尔开发者指南看似是他们使用的算法的原因是因为他们使用33并且我想使输出相同，这在我使用32位多项式 1EDC6F41 （如下所示）时没有发生。

  uint32_t poly = 0x1EDC6F41，sres，crcTable [256]，data = 0x00000000; 
 
 for（n = 0; n <256; n ++）{
 sres = n; 
 for（k = 0; k <8; k ++）
 sres =（sres& 1）== 1？聚（sres> 1）：（sres> 1）; 
 crcTable [n] = sres; 
} 
 sres = 0xFFFFFFFF; 
 
 for（n = 0; n <4; n ++）{
 sres = crcTable [（sres ^ data）& 0xFF] ^（sres>> 8）; 
}

上述代码产生 4138093821 作为输出， crc32 操作码使用输入 0x00000000 生成 2346497208 code>。

 
 
 对不起，如果这些地方写得不好或难以理解，对我来说已经晚了。
解决方案
这里有CRC-32C的软件和硬件版本。软件版本已优化，每次可处理8个字节。硬件版本被优化为在单个核上并行地有效地运行三个 crc32q 指令，因为该指令的吞吐量是一个周期，但是延迟是三个周期。 p> 
 
 
  / * crc32c.c  - 使用Intel crc32指令计算CRC-32C 
 *版权所有
 *版本1.1 1 Aug 2013 Mark Adler 
 * / 
 
 / * 
本软件按原样提供，没有任何明示或默示的
保修。在任何情况下，作者不负责任何损害
由于使用本软件。 
 
允许任何人出于任何目的使用此软件，
包括商业应用程序，并自由修改和重新分配
，但有以下限制：
 
 1.本软件的来源不得歪曲;你不能
声称你写的原始软件。如果在产品中使用此软件
，则产品文档中的确认将是
，但不是必需的。 
 2.改变的源版本必须明确标记为这样，并且不能将
表示为原始软件。 
 3.本通知不得从任何来源分发中删除或更改。 
 
 Mark Adler 
 madler@alumni.caltech.edu 
 * / 
 
 / *在英特尔SSE 4.2处理器上使用硬件CRC指令。这将计算一个
 CRC-32C，*不是以太网和zip，gzip等使用的CRC-32。软件
版本作为后退以及速度比较提供。 * / 
 
 / *版本历史：
 1.0 2013年2月10日第一版
 1.1 2013年8月1日正确评论为什么三个crc指令并行
 * / 
 
 #include< stdio.h> 
 #include< stdlib.h> 
 #include< stdint.h> 
 #include< unistd.h> 
 #include< pthread.h> 
 
 / * CRC-32C（iSCSI）多项式，按颠倒的顺序排列。 * / 
 #define POLY 0x82f63b78 
 
 / *一个四字一次软件crc的表。 * / 
 static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT; 
 static uint32_t crc32c_table [8] [256]; 
 
 / *构造软件CRC-32C计算表。 * / 
 static void crc32c_init_sw（void）
 {
 uint32_t n，crc，k; 
 
 for（n = 0; n <256; n ++）{
 crc = n; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; 
 crc = crc& 1？ （crc>> 1）^ POLY：crc>> 1; $ b $ c crc32c_table [0] [n] = crc; 
} 
 for（n = 0; n <256; n ++）{
 crc = crc32c_table [0] [n] 
 for（k = 1; k <8; k ++）{
 crc = crc32c_table [0] [crc& 0xff] ^（crc>> 8）; 
 crc32c_table [k] [n] = crc; 
} 
} 
} 
 
 / *表驱动软件版本作为后退。这比使用硬件指令的
慢约15倍。这假设小端整数，
，就像这里的汇编代码所用的英特尔处理器的情况一样。 * / 
 static uint32_t crc32c_sw（uint32_t crci，const void * buf，size_t len）
 {
 const unsigned char * next = buf; 
 uint64_t crc; 
 
 pthread_once（& crc32c_once_sw，crc32c_init_sw）; 
 crc = crci ^ 0xffffffff; 
 while（len&&（（uintptr_t）next& 7）！= 0）{
 crc = crc32c_table [0] [（crc ^ * next ++）& 0xff] ^（crc>> 8）; 
 len--; 
} 
 while（len> = 8）{
 crc ^ = *（uint64_t *）next; 
 crc = crc32c_table [7] [crc& 0xff] ^ 
 crc32c_table [6] [（crc>> 8）& 0xff] ^ 
 crc32c_table [5] [（crc>> 16）& 0xff] ^ 
 crc32c_table [4] [（crc>> 24）& 0xff] ^ 
 crc32c_table [3] [（crc>> 32）& 0xff] ^ 
 crc32c_table [2] [（crc>> 40）& 0xff] ^ 
 crc32c_table [1] [（crc>> 48）& 0xff] ^ 
 crc32c_table [0] [crc>> 56]; 
 next + = 8; 
 len  -  = 8; 
} 
 while（len）{
 crc = crc32c_table [0] [（crc ^ * next ++）& 0xff] ^（crc>> 8）; 
 len--; 
} 
 return（uint32_t）crc ^ 0xffffffff; 
} 
 
 / *将一个矩阵乘以两个元素的Galois域上的向量，
 GF（2）。每个元素都是无符号整数中的一个位。在
 vec中，对于最高有效位，必须具有
的最少数量的项的二的幂。 * / 
 static inline uint32_t gf2_matrix_times（uint32_t * mat，uint32_t vec）
 {
 uint32_t sum; 
 
 sum = 0; 
 while（vec）{
 if（vec& 1）
 sum ^ = * mat; 
 vec>> = 1; 
 mat ++; 
} 
 return sum; 
} 
 
 / *在GF（2）上乘一个矩阵。垫和正方形都必须有32 
行。 * / 
 static inline void gf2_matrix_square（uint32_t * square，uint32_t * mat）
 {
 int n; 
 
 for（n = 0; n <32; n ++）
 square [n] = gf2_matrix_times（mat，mat [n]）; 
} 
 
 / *构造一个运算符以将len零应用于crc。 len必须是
的幂。如果len不是2的幂，则结果与对于
的最大功率相同，两个小于len。 len == 0的结果与len == 1的
相同。这个例程的版本可以很容易地写为任何
 len，但这不是这个应用程序所需要的。 * / 
 static void crc32c_zeros_op（uint32_t * even，size_t len）
 {
 int n; 
 uint32_t row; 
 uint32_t odd [32]; / *奇数幂的两个零操作符* / 
 
 / * put操作符用于奇数中的一个零位* / 
 odd [0] = POLY; / * CRC-32C多项式* / 
行= 1; 
 for（n = 1; n <32; n ++）{
 odd [n] = row; 
 row<< = 1; 
} 
 
 / *在偶数* / 
 gf2_matrix_square（even，odd）中的两个零位放置运算符。 
 
 / *为奇数的四个零位放置运算符* / 
 gf2_matrix_square（odd，even）; 
 
 / *第一个方块将把运算符放到一个零字节（8个零位），
放在偶数下一个正方形舍入运算符中，用于奇数的两个零字节，因此
 on，直到len被旋转到零* / 
 do {
 gf2_matrix_square（even，odd）; 
 len>> = 1; 
 if（len == 0）
 return; 
 gf2_matrix_square（odd，even）; 
 len>> = 1; 
} while（len）;对于（n = 0; n <32; n ++）
 even [n] = odd [n]，
 
 / * answer结果为奇数拷贝到偶数* / 
 n]。 
} 
 
 / *获取一个长度并构建四个查找表，用于在操作数上逐字节应用零长度运算符
。 * / 
 static void crc32c_zeros（uint32_t zeros [] [256]，size_t len）
 {
 uint32_t n; 
 uint32_t op [32]; 
 
 crc32c_zeros_op（op，len）; 
 for（n = 0; n <256; n ++）{
 zeros [0] [n] = gf2_matrix_times 
 zeros [1] [n] = gf2_matrix_times（op，n <8）; 
 zeros [2] [n] = gf2_matrix_times（op，n <16）; 
 zeros [3] [n] = gf2_matrix_times（op，n <24）; 
} 
} 
 
 / *将zeros运算符表应用于crc。 * / 
 static inline uint32_t crc32c_shift（uint32_t zeros [] [256]，uint32_t crc）
 {
 return zeros [0] [crc& 0xff] ^ zeros [1] [（crc>> 8）& 0xff] ^ 
 zeros [2] [（crc>> 16）& 0xff] ^ zeros [3] [crc>> 24]; 
} 
 
 / *三向并行crc计算的块大小。 LONG和SHORT必须
都是两个的幂。相应的字符串常量必须相应地设置为
，以用于构建汇编器指令。 * / 
 #define LONG 8192 
 #define LONGx18192
 #define LONGx216384
 #define SHORT 256 
 #define SHORTx1256
 #define SHORTx2512
 
 / *用于通过LONG和SHORT零移位crc的硬件crc的表。 * / 
 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT; 
 static uint32_t crc32c_long [4] [256]; 
 static uint32_t crc32c_short [4] [256]; 
 
 / *初始化用于移动crcs的表。 * / 
 static void crc32c_init_hw（void）
 {
 crc32c_zeros（crc32c_long，LONG）; 
 crc32c_zeros（crc32c_short，SHORT）; 
} 
 
 / *使用Intel硬件指令计算CRC-32C。 * / 
 static uint32_t crc32c_hw（uint32_t crc，const void * buf，size_t len）
 {
 const unsigned char * next = buf; 
 const unsigned char * end; 
 uint64_t crc0，crc1，crc2; / *需要64位为crc32q * / 
 
 / *填充移位表第一次通过* / 
 pthread_once（& crc32c_once_hw，crc32c_init_hw）; 
 
 / *预处理crc * / 
 crc0 = crc ^ 0xffffffff; 
 
 / *计算最多七个前导字节的crc，将数据指针
带到一个八字节边界* / 
 while（len&&（（uintptr_t ）next& 7）！= 0）{
 __asm __（crc32b \t（％1），％0
：= r（crc0）
： r（下一个），0（crc0））; 
 next ++; 
 len--; 
} 
 
 / *计算LONG * 3字节集合上的crc，执行三个独立的crc 
指令，每个指令在LONG字节上 - 这是为Nehalem优化的，
 Westmere，Sandy Bridge和Ivy Bridge体系结构，其每周期具有一个crc的
吞吐量，但是具有三个周期的延迟* / 
同时（len> = LONG * 3）{ 
 crc1 = 0; 
 crc2 = 0; 
 end = next + LONG; 
 do {
 __asm __（crc32q\t（％3），％0 \\\
\t
crc32q\tLONGx1（％3）， ％1 \\\
\t
crc32q\tLONGx2（％3），％2
：= r（crc0），= r（crc1） = r（crc2）
：r（next），0（crc0），1（crc1），2（crc2） 
 next + = 8; 
} while（next< end）; 
 crc0 = crc32c_shift（crc32c_long，crc0）^ crc1; 
 crc0 = crc32c_shift（crc32c_long，crc0）^ crc2; 
 next + = LONG * 2; 
 len  -  = LONG * 3; 
} 
 
 / *做同样的事情，但现在在SHORT * 3块为剩余数据少
比一个LONG * 3块* / 
 while len> = SHORT * 3）{
 crc1 = 0; 
 crc2 = 0; 
 end = next + SHORT; 
 do {
 __asm __（crc32q\t（％3），％0 \\\
\t
crc32q\tSHORTx1（％3）， ％1 \\\
\t
crc32q\tSHORTx2（％3），％2
：= r（crc0），= r（crc1） = r（crc2）
：r（next），0（crc0），1（crc1），2（crc2） 
 next + = 8; 
} while（next< end）; 
 crc0 = crc32c_shift（crc32c_short，crc0）^ crc1; 
 crc0 = crc32c_shift（crc32c_short，crc0）^ crc2; 
 next + = SHORT * 2; 
 len  -  = short * 3; 
} 
 
 / *计算剩余的8字节单元的crc小于SHORT * 3 
块* / 
 end = next +（len  - len& 7））; 
 while（next< end）{
 __asm __（crc32q \t（％1），％0
：= r（crc0）
： r（next），0（crc0））; 
 next + = 8; 
} 
 len& = 7; 
 
 / *计算最多七个结尾字节的crc * / 
 while（len）{
 __asm __（crc32b\t（％1），％0 
：= r（crc0）
：r（next），0（crc0） 
 next ++; 
 len--; 
} 
 
 / *返回一个后处理的crc * / 
 return（uint32_t）crc0 ^ 0xffffffff; 
} 
 
 / *检查SSE 4.2。 SSE 4.2在2008年11月推出的Nehalem处理器
中首次得到支持。这不会检查1992年在486SL上引入的
 cpuid指令本身的存在，所以这个
将在早期的x86处理器上失败。 cpuid适用于所有奔腾和以后的
处理器。 * / 
 #define SSE42（have）\ 
 do {\ 
 uint32_t eax，ecx; \ 
 eax = 1; \ 
 __asm __（cpuid\ 
：= c（ecx）\ 
：a（eax）\ 
：％ebx ％edx）; \ 
（have）=（ecx>> 20）& 1; \ 
} while（0）
 
 / *计算CRC-32C。如果crc32指令可用，使用硬件
版本。否则，使用软件版本。 * / 
 uint32_t crc32c（uint32_t crc，const void * buf，size_t len）
 {
 int sse42; 
 
 SSE42（sse42）; 
 return sse42？ crc32c_hw（crc，buf，len）：crc32c_sw（crc，buf，len）; 
} 
 
 #ifdef TEST 
 
 #define SIZE（262144 * 3）
 #define CHUNK SIZE 
 
 int main （int argc，char ** argv）
 {
 char * buf; 
 ssize_t got; 
 size_t off，n; 
 uint32_t crc; 
 
（void）argv; 
 crc = 0; 
 buf = malloc（SIZE）; 
 if（buf == NULL）{
 fputs（out of memory，stderr）; 
 return 1; 
} 
 while（（got = read（0，buf，SIZE））> 0）{
 off = 0; 
 do {
 n =（size_t）get  -  off; 
 if（n> CHUNK）
 n = CHUNK; 
 crc = argc> 1？ crc32c_sw（crc，buf + off，n）：
 crc32c（crc，buf + off，n） 
 off + = n; 
} while（off<（size_t）got）; 
} 
 free（buf）; 
 if（got == -1）{
 fputs（read error\\\
，stderr）; 
 return 1; 
} 
 printf（％08x\\\
，crc）; 
 return 0; 
} 
 
 #endif / * TEST * / 
  
 
So I have a design which incorporates CRC32C checksums to ensure data hasn't been damaged. I decided to use CRC32C because I can have both a software version and a hardware-accelerated version if the computer the software runs on supports SSE 4.2

I'm going by Intel's developer manual (vol 2A), which seems to provide the algorithm behind the crc32 instruction. However, I'm having little luck. Intel's developer guide says the following:
BIT_REFLECT32: DEST[31-0] = SRC[0-31]
MOD2: Remainder from Polynomial division modulus 2

TEMP1[31-0] <- BIT_REFLECT(SRC[31-0])
TEMP2[31-0] <- BIT_REFLECT(DEST[31-0])
TEMP3[63-0] <- TEMP1[31-0] << 32
TEMP4[63-0] <- TEMP2[31-0] << 32
TEMP5[63-0] <- TEMP3[63-0] XOR TEMP4[63-0]
TEMP6[31-0] <- TEMP5[63-0] MOD2 0x11EDC6F41
DEST[31-0]  <- BIT_REFLECT(TEMP6[31-0])
Now, as far as I can tell, I've done everything up to the line starting TEMP6 correctly, but I think I may be either misunderstanding the polynomial division, or implementing it incorrectly. If my understanding is correct, 1 / 1 mod 2 = 1, 0 / 1 mod 2 = 0, and both divides-by-zero are undefined.

What I don't understand is how binary division with 64-bit and 33-bit operands will work. If SRC is 0x00000000, and DEST is 0xFFFFFFFF, TEMP5[63-32] will be all set bits, while TEMP5[31-0] will be all unset bits.

If I was to use the bits from TEMP5 as the numerator, there would be 30 divisions by zero as the polynomial 11EDC6F41 is only 33 bits long (and so converting it to a 64-bit unsigned integer leaves the top 30 bits unset), and so the denominator is unset for 30 bits.

However, if I was to use the polynomial as the numerator, the bottom 32 bits of TEMP5 are unset, resulting in divides by zero there, and the top 30 bits of the result would be zero, as the top 30 bits of the numerator would be zero, as 0 / 1 mod 2 = 0.

Am I misunderstanding how this works? Just plain missing something? Or has Intel left out some crucial step in their documentation?

The reason I went to Intel's developer guide for what appeared to be the algorithm they used is because they used a 33-bit polynomial, and I wanted to make outputs identical, which didn't happen when I used the 32-bit polynomial 1EDC6F41 (show below).
uint32_t poly = 0x1EDC6F41, sres, crcTable[256], data = 0x00000000;

for (n = 0; n < 256; n++) {
    sres = n;
    for (k = 0; k < 8; k++)
        sres = (sres & 1) == 1 ? poly ^ (sres >> 1) : (sres >> 1);
    crcTable[n] = sres;
}
sres = 0xFFFFFFFF;

for (n = 0; n < 4; n++) {
    sres = crcTable[(sres ^ data) & 0xFF] ^ (sres >> 8);
}
The above code produces 4138093821 as an output, and the crc32 opcode produces 2346497208 using the input 0x00000000.

Sorry if this is badly written or incomprehensible in places, it is rather late for me.
 解决方案 
Here are both software and hardware versions of CRC-32C.  The software version is optimized to process eight bytes at a time.  The hardware version is optimized to run three crc32q instructions effectively in parallel on a single core, since the throughput of that instruction is one cycle, but the latency is three cycles.
/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
 * Copyright (C) 2013 Mark Adler
 * Version 1.1  1 Aug 2013  Mark Adler
 */

/*
  This software is provided 'as-is', without any express or implied
  warranty.  In no event will the author be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  Mark Adler
  madler@alumni.caltech.edu
 */

/* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
   CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
   version is provided as a fall-back, as well as for speed comparisons. */

/* Version history:
   1.0  10 Feb 2013  First version
   1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <pthread.h>

/* CRC-32C (iSCSI) polynomial in reversed bit order. */
#define POLY 0x82f63b78

/* Table for a quadword-at-a-time software crc. */
static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
static uint32_t crc32c_table[8][256];

/* Construct table for software CRC-32C calculation. */
static void crc32c_init_sw(void)
{
    uint32_t n, crc, k;

    for (n = 0; n < 256; n++) {
        crc = n;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc32c_table[0][n] = crc;
    }
    for (n = 0; n < 256; n++) {
        crc = crc32c_table[0][n];
        for (k = 1; k < 8; k++) {
            crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
            crc32c_table[k][n] = crc;
        }
    }
}

/* Table-driven software version as a fall-back.  This is about 15 times slower
   than using the hardware instructions.  This assumes little-endian integers,
   as is the case on Intel processors that the assembler code here is for. */
static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len)
{
    const unsigned char *next = buf;
    uint64_t crc;

    pthread_once(&crc32c_once_sw, crc32c_init_sw);
    crc = crci ^ 0xffffffff;
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }
    while (len >= 8) {
        crc ^= *(uint64_t *)next;
        crc = crc32c_table[7][crc & 0xff] ^
              crc32c_table[6][(crc >> 8) & 0xff] ^
              crc32c_table[5][(crc >> 16) & 0xff] ^
              crc32c_table[4][(crc >> 24) & 0xff] ^
              crc32c_table[3][(crc >> 32) & 0xff] ^
              crc32c_table[2][(crc >> 40) & 0xff] ^
              crc32c_table[1][(crc >> 48) & 0xff] ^
              crc32c_table[0][crc >> 56];
        next += 8;
        len -= 8;
    }
    while (len) {
        crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }
    return (uint32_t)crc ^ 0xffffffff;
}

/* Multiply a matrix times a vector over the Galois field of two elements,
   GF(2).  Each element is a bit in an unsigned integer.  mat must have at
   least as many entries as the power of two for most significant one bit in
   vec. */
static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
{
    uint32_t sum;

    sum = 0;
    while (vec) {
        if (vec & 1)
            sum ^= *mat;
        vec >>= 1;
        mat++;
    }
    return sum;
}

/* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
   rows. */
static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat)
{
    int n;

    for (n = 0; n < 32; n++)
        square[n] = gf2_matrix_times(mat, mat[n]);
}

/* Construct an operator to apply len zeros to a crc.  len must be a power of
   two.  If len is not a power of two, then the result is the same as for the
   largest power of two less than len.  The result for len == 0 is the same as
   for len == 1.  A version of this routine could be easily written for any
   len, but that is not needed for this application. */
static void crc32c_zeros_op(uint32_t *even, size_t len)
{
    int n;
    uint32_t row;
    uint32_t odd[32];       /* odd-power-of-two zeros operator */

    /* put operator for one zero bit in odd */
    odd[0] = POLY;              /* CRC-32C polynomial */
    row = 1;
    for (n = 1; n < 32; n++) {
        odd[n] = row;
        row <<= 1;
    }

    /* put operator for two zero bits in even */
    gf2_matrix_square(even, odd);

    /* put operator for four zero bits in odd */
    gf2_matrix_square(odd, even);

    /* first square will put the operator for one zero byte (eight zero bits),
       in even -- next square puts operator for two zero bytes in odd, and so
       on, until len has been rotated down to zero */
    do {
        gf2_matrix_square(even, odd);
        len >>= 1;
        if (len == 0)
            return;
        gf2_matrix_square(odd, even);
        len >>= 1;
    } while (len);

    /* answer ended up in odd -- copy to even */
    for (n = 0; n < 32; n++)
        even[n] = odd[n];
}

/* Take a length and build four lookup tables for applying the zeros operator
   for that length, byte-by-byte on the operand. */
static void crc32c_zeros(uint32_t zeros[][256], size_t len)
{
    uint32_t n;
    uint32_t op[32];

    crc32c_zeros_op(op, len);
    for (n = 0; n < 256; n++) {
        zeros[0][n] = gf2_matrix_times(op, n);
        zeros[1][n] = gf2_matrix_times(op, n << 8);
        zeros[2][n] = gf2_matrix_times(op, n << 16);
        zeros[3][n] = gf2_matrix_times(op, n << 24);
    }
}

/* Apply the zeros operator table to crc. */
static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
{
    return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
           zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
}

/* Block sizes for three-way parallel crc computation.  LONG and SHORT must
   both be powers of two.  The associated string constants must be set
   accordingly, for use in constructing the assembler instructions. */
#define LONG 8192
#define LONGx1 "8192"
#define LONGx2 "16384"
#define SHORT 256
#define SHORTx1 "256"
#define SHORTx2 "512"

/* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
static uint32_t crc32c_long[4][256];
static uint32_t crc32c_short[4][256];

/* Initialize tables for shifting crcs. */
static void crc32c_init_hw(void)
{
    crc32c_zeros(crc32c_long, LONG);
    crc32c_zeros(crc32c_short, SHORT);
}

/* Compute CRC-32C using the Intel hardware instruction. */
static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
{
    const unsigned char *next = buf;
    const unsigned char *end;
    uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */

    /* populate shift tables the first time through */
    pthread_once(&crc32c_once_hw, crc32c_init_hw);

    /* pre-process the crc */
    crc0 = crc ^ 0xffffffff;

    /* compute the crc for up to seven leading bytes to bring the data pointer
       to an eight-byte boundary */
    while (len && ((uintptr_t)next & 7) != 0) {
        __asm__("crc32b\t" "(%1), %0"
                : "=r"(crc0)
                : "r"(next), "0"(crc0));
        next++;
        len--;
    }

    /* compute the crc on sets of LONG*3 bytes, executing three independent crc
       instructions, each on LONG bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= LONG*3) {
        crc1 = 0;
        crc2 = 0;
        end = next + LONG;
        do {
            __asm__("crc32q\t" "(%3), %0\n\t"
                    "crc32q\t" LONGx1 "(%3), %1\n\t"
                    "crc32q\t" LONGx2 "(%3), %2"
                    : "=r"(crc0), "=r"(crc1), "=r"(crc2)
                    : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
            next += 8;
        } while (next < end);
        crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
        crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
        next += LONG*2;
        len -= LONG*3;
    }

    /* do the same thing, but now on SHORT*3 blocks for the remaining data less
       than a LONG*3 block */
    while (len >= SHORT*3) {
        crc1 = 0;
        crc2 = 0;
        end = next + SHORT;
        do {
            __asm__("crc32q\t" "(%3), %0\n\t"
                    "crc32q\t" SHORTx1 "(%3), %1\n\t"
                    "crc32q\t" SHORTx2 "(%3), %2"
                    : "=r"(crc0), "=r"(crc1), "=r"(crc2)
                    : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
            next += 8;
        } while (next < end);
        crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
        crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
        next += SHORT*2;
        len -= SHORT*3;
    }

    /* compute the crc on the remaining eight-byte units less than a SHORT*3
       block */
    end = next + (len - (len & 7));
    while (next < end) {
        __asm__("crc32q\t" "(%1), %0"
                : "=r"(crc0)
                : "r"(next), "0"(crc0));
        next += 8;
    }
    len &= 7;

    /* compute the crc for up to seven trailing bytes */
    while (len) {
        __asm__("crc32b\t" "(%1), %0"
                : "=r"(crc0)
                : "r"(next), "0"(crc0));
        next++;
        len--;
    }

    /* return a post-processed crc */
    return (uint32_t)crc0 ^ 0xffffffff;
}

/* Check for SSE 4.2.  SSE 4.2 was first supported in Nehalem processors
   introduced in November, 2008.  This does not check for the existence of the
   cpuid instruction itself, which was introduced on the 486SL in 1992, so this
   will fail on earlier x86 processors.  cpuid works on all Pentium and later
   processors. */
#define SSE42(have) \
    do { \
        uint32_t eax, ecx; \
        eax = 1; \
        __asm__("cpuid" \
                : "=c"(ecx) \
                : "a"(eax) \
                : "%ebx", "%edx"); \
        (have) = (ecx >> 20) & 1; \
    } while (0)

/* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
   version.  Otherwise, use the software version. */
uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
    int sse42;

    SSE42(sse42);
    return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
}

#ifdef TEST

#define SIZE (262144*3)
#define CHUNK SIZE

int main(int argc, char **argv)
{
    char *buf;
    ssize_t got;
    size_t off, n;
    uint32_t crc;

    (void)argv;
    crc = 0;
    buf = malloc(SIZE);
    if (buf == NULL) {
        fputs("out of memory", stderr);
        return 1;
    }
    while ((got = read(0, buf, SIZE)) > 0) {
        off = 0;
        do {
            n = (size_t)got - off;
            if (n > CHUNK)
                n = CHUNK;
            crc = argc > 1 ? crc32c_sw(crc, buf + off, n) :
                             crc32c(crc, buf + off, n);
            off += n;
        } while (off < (size_t)got);
    }
    free(buf);
    if (got == -1) {
        fputs("read error\n", stderr);
        return 1;
    }
    printf("%08x\n", crc);
    return 0;
}

#endif /* TEST */


                        
这篇关于在软件中实现SSE 4.2的CRC32C的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！


                    
                        查看全文

在软件中实现SSE 4.2的CRC32C [英] Implementing SSE 4.2's CRC32C in software

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录关闭

在软件中实现SSE 4.2的CRC32C [英] Implementing SSE 4.2&#39;s CRC32C in software

问题描述

相关文章

C/C++开发最新文章

热门教程

热门工具

登录 关闭

在软件中实现SSE 4.2的CRC32C [英] Implementing SSE 4.2's CRC32C in software

登录关闭