我如何访问SHA内在? [英] How can I access SHA intrinsic?

查看:348
本文介绍了我如何访问SHA内在?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

Gprof告诉我,我的计算密集型程序大部分时间(36%)都花在使用AP-Hash进行散列上。

我无法减少调用次数,但我仍然想让它更快。我可以从C程序中调用SHA内部函数(intrinsic)吗?



我需要intel编译器还是可以使用gcc?

解决方案

SHA指令现已在 Goldmont 架构中提供。该架构于2016年9月左右发布。根据英特尔内部函数指南(Intel Intrinsics Guide),以下是相关的内部函数:


  • __m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
  • __m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)



GCC 5.0及更高版本通过函数特定选项编译指示(Function Specific Option Pragmas)使这些内部函数始终可用。但是,您需要Binutils 2.24。测试还显示Clang 3.7和3.8支持这些内部函数。测试还显示Visual Studio 2015可以使用它们,但VS2013无法编译它们。



在Linux上,您可以在预处理器中通过查找宏 __SHA__ 来检测SHA是否可用。如果处理器本身支持SHA, -march=native 会使其可用。如果不支持,您可以使用 -msha 来启用它。

  $ gcc -march=native -dM -E - </dev/null | egrep -i '(aes|rdrnd|rdseed|sha)'
#define __RDRND__ 1
#define __SHA__ 1
#define __RDSEED__ 1
#define __AES__ 1

使用SHA1的代码如下所示。它基于英特尔题为《英特尔®SHA扩展》的博客文章。另一个参考实现可从 miTLS 项目获得。






以下代码基于英特尔®SHA扩展博客。该代码处理完整的SHA1块,因此 const uint32_t *data 为64个字节。您必须自行为最后一个块添加填充并设置位长。



它在Celeron J3455上以每字节约1.7个周期(cpb)运行。我相信Andy Polyakov为OpenSSL实现的SHA1运行在约1.5 cpb。作为参考,经过优化的C/C++实现大约运行在9到10 cpb的范围内。

  /*
   * SHA1_SHAEXT_Transform - run the SHA-1 compression function over one
   * 64-byte block using the Intel SHA extension intrinsics.
   *
   * state: five 32-bit words of chaining state; words 0..3 are read as a
   *        vector and word 4 separately. Updated in place.
   * data:  one complete 64-byte message block in message byte order. The
   *        caller must pad the final block and append the bit length.
   *
   * The intrinsics need SHA support enabled at compile time (for example
   * -msha or -march=native with GCC) and a CPU that implements them.
   */
  static void SHA1_SHAEXT_Transform(uint32_t *state, const uint32_t *data)
  {
      __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
      __m128i MASK, MSG0, MSG1, MSG2, MSG3;

      // Load initial values. 0x1B reverses the four 32-bit lanes; the fifth
      // state word goes into the top lane of E0.
      ABCD = _mm_loadu_si128((const __m128i *)state);
      E0 = _mm_set_epi32((int)state[4], 0, 0, 0);
      ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
      // Byte-shuffle mask that reverses all 16 bytes of a vector.
      MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);

      // Save current hash
      ABCD_SAVE = ABCD;
      E0_SAVE = E0;

      // Rounds 0-3
      MSG0 = _mm_loadu_si128((const __m128i *)(data + 0));
      MSG0 = _mm_shuffle_epi8(MSG0, MASK);
      E0 = _mm_add_epi32(E0, MSG0);
      E1 = ABCD;
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

      // Rounds 4-7
      MSG1 = _mm_loadu_si128((const __m128i *)(data + 4));
      MSG1 = _mm_shuffle_epi8(MSG1, MASK);
      E1 = _mm_sha1nexte_epu32(E1, MSG1);
      E0 = ABCD;
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
      MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

      // Rounds 8-11
      MSG2 = _mm_loadu_si128((const __m128i *)(data + 8));
      MSG2 = _mm_shuffle_epi8(MSG2, MASK);
      E0 = _mm_sha1nexte_epu32(E0, MSG2);
      E1 = ABCD;
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
      MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
      MSG0 = _mm_xor_si128(MSG0, MSG2);

      // Rounds 12-15
      MSG3 = _mm_loadu_si128((const __m128i *)(data + 12));
      MSG3 = _mm_shuffle_epi8(MSG3, MASK);
      E1 = _mm_sha1nexte_epu32(E1, MSG3);
      E0 = ABCD;
      MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
      MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
      MSG1 = _mm_xor_si128(MSG1, MSG3);

      // Rounds 16-19
      E0 = _mm_sha1nexte_epu32(E0, MSG0);
      E1 = ABCD;
      MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
      MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
      MSG2 = _mm_xor_si128(MSG2, MSG0);

      // Rounds 20-23 (function selector switches to 1 for rounds 20-39)
      E1 = _mm_sha1nexte_epu32(E1, MSG1);
      E0 = ABCD;
      MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
      MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
      MSG3 = _mm_xor_si128(MSG3, MSG1);

      // Rounds 24-27
      E0 = _mm_sha1nexte_epu32(E0, MSG2);
      E1 = ABCD;
      MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
      MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
      MSG0 = _mm_xor_si128(MSG0, MSG2);

      // Rounds 28-31
      E1 = _mm_sha1nexte_epu32(E1, MSG3);
      E0 = ABCD;
      MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
      MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
      MSG1 = _mm_xor_si128(MSG1, MSG3);

      // Rounds 32-35
      E0 = _mm_sha1nexte_epu32(E0, MSG0);
      E1 = ABCD;
      MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
      MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
      MSG2 = _mm_xor_si128(MSG2, MSG0);

      // Rounds 36-39
      E1 = _mm_sha1nexte_epu32(E1, MSG1);
      E0 = ABCD;
      MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
      MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
      MSG3 = _mm_xor_si128(MSG3, MSG1);

      // Rounds 40-43 (function selector switches to 2 for rounds 40-59)
      E0 = _mm_sha1nexte_epu32(E0, MSG2);
      E1 = ABCD;
      MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
      MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
      MSG0 = _mm_xor_si128(MSG0, MSG2);

      // Rounds 44-47
      E1 = _mm_sha1nexte_epu32(E1, MSG3);
      E0 = ABCD;
      MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
      MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
      MSG1 = _mm_xor_si128(MSG1, MSG3);

      // Rounds 48-51
      E0 = _mm_sha1nexte_epu32(E0, MSG0);
      E1 = ABCD;
      MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
      MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
      MSG2 = _mm_xor_si128(MSG2, MSG0);

      // Rounds 52-55
      E1 = _mm_sha1nexte_epu32(E1, MSG1);
      E0 = ABCD;
      MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
      MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
      MSG3 = _mm_xor_si128(MSG3, MSG1);

      // Rounds 56-59
      E0 = _mm_sha1nexte_epu32(E0, MSG2);
      E1 = ABCD;
      MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
      MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
      MSG0 = _mm_xor_si128(MSG0, MSG2);

      // Rounds 60-63 (function selector switches to 3 for rounds 60-79)
      E1 = _mm_sha1nexte_epu32(E1, MSG3);
      E0 = ABCD;
      MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
      MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
      MSG1 = _mm_xor_si128(MSG1, MSG3);

      // Rounds 64-67
      E0 = _mm_sha1nexte_epu32(E0, MSG0);
      E1 = ABCD;
      MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
      MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
      MSG2 = _mm_xor_si128(MSG2, MSG0);

      // Rounds 68-71
      E1 = _mm_sha1nexte_epu32(E1, MSG1);
      E0 = ABCD;
      MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
      MSG3 = _mm_xor_si128(MSG3, MSG1);

      // Rounds 72-75
      E0 = _mm_sha1nexte_epu32(E0, MSG2);
      E1 = ABCD;
      MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
      ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

      // Rounds 76-79
      E1 = _mm_sha1nexte_epu32(E1, MSG3);
      E0 = ABCD;
      ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

      // Add the saved input state back into the result
      E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
      ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

      // Save state: undo the lane reversal, then write back all five words
      ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
      _mm_storeu_si128((__m128i *)state, ABCD);
      state[4] = (uint32_t)_mm_extract_epi32(E0, 3);
  }






在Linux下,您可以通过查找 sha_ni 标志来判断您的处理器是否支持SHA扩展:

 $ cat /proc/cpuinfo
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model       : 92
model name  : Intel(R) Celeron(R) CPU J3455 @ 1.50GHz
stepping    : 9
microcode   : 0x1a
cpu MHz     : 799.987
cache size  : 1024 KB
physical id : 0
siblings    : 4
core id     : 0
cpu cores   : 4
apicid      : 0
initial apicid  : 0
fpu     : yes
fpu_exception   : yes
cpuid level : 21
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36
clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclm
ulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg cx16 xtpr pdcm sse4_1 sse4_2 x2apic mov
be popcnt tsc_deadline_timer aes xsave rdrand lahf_lm 3dnowprefetch intel_pt tpr_shadow vn
mi flexpriority ept vpid fsgsbase tsc_adjust smep erms mpx rdseed smap clflushopt sha_ni x
saveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts
bugs        : monitor
bogomips    : 2995.20
clflush size    : 64
cache_alignment : 64
address sizes   : 39 bits physical, 48 bits virtual
power management:
...






另请参阅《x86中是否有任何加速SHA(SHA1/2/256/512)编码的指令?》



您可以在 Noloader GitHub | SHA-Intrinsics 找到英特尔SHA内部函数和ARMv8 SHA内部函数的源代码。它们是C源文件,并为SHA-1、SHA-224和SHA-256提供压缩函数。基于内部函数的实现使SHA-1的吞吐量提高约3倍至4倍,使SHA-224和SHA-256的吞吐量提高约6倍至12倍。


Gprof tells me that my computationally heavy program spends most of its time (36%) hashing using AP-Hash.

I can't reduce the call count, but I would still like to make it faster. Can I call the SHA intrinsics from a C program?

Do I need the intel compiler or can I stick with gcc?

解决方案

SHA instructions are now available in Goldmont architecture. It was released around September, 2016. According to the Intel Intrinsics Guide, these are the intrinsics of interest:

  • __m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
  • __m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
  • __m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)

GCC 5.0 and above make intrinsics available all the time for Function Specific Option Pragmas. You will need Binutils 2.24, however. Testing also shows Clang 3.7 and 3.8 support the intrinsics. Testing also shows Visual Studio 2015 can consume them, but VS2013 failed to compile them.

You can detect the availability of SHA in the preprocessor on Linux by looking for the macro __SHA__. -march=native will make it available if it's native to the processor. If not, you can enable it with -msha.

$ gcc -march=native -dM -E - </dev/null | egrep -i '(aes|rdrnd|rdseed|sha)'
#define __RDRND__ 1
#define __SHA__ 1
#define __RDSEED__ 1
#define __AES__ 1

The code for using SHA1 is shown below. It's based on Intel's blog titled Intel® SHA Extensions. Another reference implementation is available from the miTLS project.


The code below is based on Intel® SHA Extensions blog. The code works with full SHA1 blocks, so const uint32_t *data is 64 bytes. You will have to add the padding for the final block and set the bit length.

It runs at about 1.7 cycles-per-byte (cpb) on a Celeron J3455. I believe Andy Polyakov has SHA1 running around 1.5 cpb for OpenSSL. For reference, an optimized C/C++ implementation will run somewhere around 9 to 10 cpb.

/*
 * SHA1_SHAEXT_Transform - run the SHA-1 compression function over one
 * 64-byte block using the Intel SHA extension intrinsics.
 *
 * state: five 32-bit words of chaining state; words 0..3 are read as a
 *        vector and word 4 separately. Updated in place.
 * data:  one complete 64-byte message block in message byte order. The
 *        caller must pad the final block and append the bit length.
 *
 * The intrinsics need SHA support enabled at compile time (for example
 * -msha or -march=native with GCC) and a CPU that implements them.
 */
static void SHA1_SHAEXT_Transform(uint32_t *state, const uint32_t *data)
{
    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
    __m128i MASK, MSG0, MSG1, MSG2, MSG3;

    // Load initial values. 0x1B reverses the four 32-bit lanes; the fifth
    // state word goes into the top lane of E0.
    ABCD = _mm_loadu_si128((const __m128i *)state);
    E0 = _mm_set_epi32((int)state[4], 0, 0, 0);
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    // Byte-shuffle mask that reverses all 16 bytes of a vector.
    MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);

    // Save current hash
    ABCD_SAVE = ABCD;
    E0_SAVE = E0;

    // Rounds 0-3
    MSG0 = _mm_loadu_si128((const __m128i *)(data + 0));
    MSG0 = _mm_shuffle_epi8(MSG0, MASK);
    E0 = _mm_add_epi32(E0, MSG0);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

    // Rounds 4-7
    MSG1 = _mm_loadu_si128((const __m128i *)(data + 4));
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

    // Rounds 8-11
    MSG2 = _mm_loadu_si128((const __m128i *)(data + 8));
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    // Rounds 12-15
    MSG3 = _mm_loadu_si128((const __m128i *)(data + 12));
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    // Rounds 16-19
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    // Rounds 20-23 (function selector switches to 1 for rounds 20-39)
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    // Rounds 24-27
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    // Rounds 28-31
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    // Rounds 32-35
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    // Rounds 36-39
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    // Rounds 40-43 (function selector switches to 2 for rounds 40-59)
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    // Rounds 44-47
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    // Rounds 48-51
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    // Rounds 52-55
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    // Rounds 56-59
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    // Rounds 60-63 (function selector switches to 3 for rounds 60-79)
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    // Rounds 64-67
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    // Rounds 68-71
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    // Rounds 72-75
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

    // Rounds 76-79
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

    // Add the saved input state back into the result
    E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
    ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

    // Save state: undo the lane reversal, then write back all five words
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    _mm_storeu_si128((__m128i *)state, ABCD);
    state[4] = (uint32_t)_mm_extract_epi32(E0, 3);
}


You can tell if your processor supports the SHA extensions under Linux by looking for the sha_ni flag:

$ cat /proc/cpuinfo
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model       : 92
model name  : Intel(R) Celeron(R) CPU J3455 @ 1.50GHz
stepping    : 9
microcode   : 0x1a
cpu MHz     : 799.987
cache size  : 1024 KB
physical id : 0
siblings    : 4
core id     : 0
cpu cores   : 4
apicid      : 0
initial apicid  : 0
fpu     : yes
fpu_exception   : yes
cpuid level : 21
wp      : yes
flags       : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 
clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc 
art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclm
ulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg cx16 xtpr pdcm sse4_1 sse4_2 x2apic mov
be popcnt tsc_deadline_timer aes xsave rdrand lahf_lm 3dnowprefetch intel_pt tpr_shadow vn
mi flexpriority ept vpid fsgsbase tsc_adjust smep erms mpx rdseed smap clflushopt sha_ni x
saveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts
bugs        : monitor
bogomips    : 2995.20
clflush size    : 64
cache_alignment : 64
address sizes   : 39 bits physical, 48 bits virtual
power management:
...


Also see Are there in x86 any instructions to accelerate SHA (SHA1/2/256/512) encoding?

You can find source for both Intel SHA intrinsics and ARMv8 SHA intrinsics at Noloader GitHub | SHA-Intrinsics. They are C source files, and provide the compress function for SHA-1, SHA-224 and SHA-256. The intrinsic based implementations increase throughput approximately 3x to 4x for SHA-1, and approximately 6x to 12x for SHA-224 and SHA-256.

这篇关于我如何访问SHA内在?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆