FLT_EPSILON的n次方根器时，SSE / AVX [英] FLT_EPSILON for a nth root finder with SSE/AVX

查看：155 发布时间：2016/8/24 14:43:36 c floating-point sse avx

本文介绍了FLT_EPSILON的n次方根器时，SSE / AVX的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我试图将其转换发现的n次方根用C从以下链接双重值的函数
HTTP：//rosetta$c$c.org/wiki/Nth_root#C
找到8彩车的n次方根一次使用AVX。

这code部分采用DBL_EPSILON * 10。然而，当我将它转换为使用float与AVX我必须使用FLT_EPSILON * 1000或code挂起，不收敛。当我打印出来FLT_EPSILON我看到它是为了1E-7。但这个环节， http://www.cplusplus.com/reference/cfloat/
说，这应该是1E-5。当我打印出来DBL_EPSILON是1E-16，但该链接说它应该只1E-9。这是怎么回事？

下面是code至今（不完全优化）。

 的#include＆LT;＆stdio.h中GT;
＃包括LT＆;＆FLOAT.H GT;
＃包括LT＆;＆immintrin.h GT; // AVX内联双abs_（双X）{
    返回X＆GT = 0？ X：-x;
}双pow_（双X，INT E）
{
    双RET = 1;
    对于（RET = 1，E，X * = X，E＆GT;＆GT; = 1）{
        如果（（E＆安培; 1））* RET = X;
    }
    返回RET;
}双根（双一，INT N）
{
    双D，X = 1;
    X = A / N;
    如果返回0（！）;
    //如果（N。1 ||（一个＆℃，放大器;＆放大器;！（N安培; 1）））返回0./0 .; / * *为NaN /    INT CNT = 0;
    做{
        CNT ++;
        D =（A / pow_（X，N  -  1） -  X）/ N;
        X + = D;
    }而（abs_（D）＆GT; = abs_（X）*（* DBL_EPSILON 10））;
    的printf（％d个\\ N，CNT）;    返回X;
}
__m256 pow_avx（__ M256 X，INT E）{
    __m256 RET = _mm256_set1_ps（1.0F）;
    对于（，E，X = _mm256_mul_ps（X，X），E＆GT;＆GT; = 1）{
        如果（（E＆安培; 1））= RET _mm256_mul_ps（X，RET）;
    }
    返回RET;
}内嵌__m256 abs_avx（__m256 X）{
    返回_mm256_max_ps（_mm256_sub_ps（_mm256_setzero_ps（）中，x）中，x）;
    //返回X＆GT; = 0？ X：-x;
}INT get_mask（常量__m256 D，const的__m256 X）{
    __m256广告= abs_avx（四）;
    __m256 AX = abs_avx（X）;
    __m256i面膜= _mm256_castps_si256（_mm256_cmp_ps（广告，斧，_CMP_GT_OQ））;
    返回_mm_movemask_epi8（_mm256_castsi256_si128（掩模））+ _mm_movemask_epi8（_mm256_extractf128_si256（屏蔽，1））;
}__m256 root_avx（__ M256一，INT N）{
    的printf（％E \\ N，FLT_EPSILON）;
    的printf（％E \\ N，DBL_EPSILON）;
    的printf（％E \\ N，FLT_EPSILON * 1000.0f）;
    __m256 D组;
    __m256 X = _mm256_set1_ps（1.0F）;
    //如果返回0（！）;
    //如果（N。1 ||（一个＆℃，放大器;＆放大器;！（N安培; 1）））返回0./0 .; / * *为NaN /
    __m256在= _mm256_set1_ps（1.0F / N）;
    __m256 XTMP;
    做{
        D = _mm256_rcp_ps（pow_avx（X，N  -  1））;
        D = _mm256_sub_ps（_mm256_mul_ps（A，D）中，x）;
        D = _mm256_mul_ps（D，中）;
        // D =（A / pow_avx（X，N  -  1） -  X）/ N;
        X = _mm256_add_ps（X，D）; // X + = D;
        XTMP = _mm256_mul_ps（X，_mm256_set1_ps（FLT_EPSILON * 100.0f））;
    //}而（abs_（D）＆GT; = abs_（X）*（* DBL_EPSILON 10））;
    }而（get_mask（D，XTMP））;
    返回X;
}诠释的main（）
{
    __m256 D = _mm256_set1_ps（16.0f）;
    __m256出= root_avx（D，4）;
    float结果[8];
    INT I;
    _mm256_storeu_ps（结果出来）;    对于（i = 0; I＆LT; 8;我++）{
        的printf（％F \\ N，造成[I]）;
    } printf的（\\ n）;    //双X = 16;
    //输出（根（％G，15）=％G \\ N，X，根（X，4））;    //双X = POW _（ -  3.14159，15）;
    //输出（根（％G，15）=％G \\ N，X，根（X，15））;
    返回0;
}

解决方案

_mm256_rcp_ps ，它映射到 rcpps 指令执行只是一个大概的倒数。在<一个href=\"http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html\"相对=nofollow>英特尔64和IA-32架构软件开发人员手册说，它的相对误差可能高达1.5•2 ^-12。这是不足以使求根准确衔接 100 * FLT_EPSILON 。

您可以使用一个确切的划分，如：

  D = pow_avx（X，N-1）;
D = _mm256_sub_ps（_mm256_div_ps（A，D）中，x）;

或者增加一些细化步骤，倒数的估计。

顺便说一下，如果你的编译器支持使用常规的C运营商SIMD对象，可以考虑使用常规的C运算符来代替：

  D = pow_avx（X，N-1）;
D = A / D  -  X;

I'm trying to convert a function that finds the nth root in C for a double value from the following link http://rosettacode.org/wiki/Nth_root#C to find the nth root for 8 floats at once using AVX.

Part of that code uses DBL_EPSILON * 10. However, when I convert this to use float with AVX I have to use FLT_EPSILON*1000 or the code hangs and does not converge. When I print out FLT_EPSILON I see it is order 1E-7. But this link, http://www.cplusplus.com/reference/cfloat/ , says it should be 1E-5. When I print out DBL_EPSILON it's 1E-16 but the link says it should only be 1E-9. What's going on?

Here is the code so far (not fully optimized).

#include <stdio.h>
#include <float.h>
#include <immintrin.h>                 // AVX

inline double abs_(double x) { 
    return x >= 0 ? x : -x; 
}

double pow_(double x, int e)
{
    double ret = 1;
    for (ret = 1; e; x *= x, e >>= 1) {
        if ((e & 1)) ret *= x;
    }
    return ret;
}

double root(double a, int n)
{
    double d, x = 1;
    x = a/n;
    if (!a) return 0;
    //if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */

    int cnt = 0;
    do {
        cnt++;  
        d = (a / pow_(x, n - 1) - x) / n;
        x+= d;
    } while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
    printf("%d\n", cnt);    

    return x;
}


__m256 pow_avx(__m256 x, int e) {
    __m256 ret = _mm256_set1_ps(1.0f);
    for (; e; x = _mm256_mul_ps(x,x), e >>= 1) {
        if ((e & 1)) ret = _mm256_mul_ps(x,ret);
    }
    return ret;
}

inline __m256 abs_avx (__m256 x) { 
    return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), x), x);
    //return x >= 0 ? x : -x; 
}

int get_mask(const __m256 d, const __m256 x) {
    __m256 ad = abs_avx(d);
    __m256 ax = abs_avx(x);
    __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(ad, ax, _CMP_GT_OQ));
    return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}

__m256 root_avx(__m256 a, int n) {
    printf("%e\n", FLT_EPSILON);
    printf("%e\n", DBL_EPSILON);
    printf("%e\n", FLT_EPSILON*1000.0f);
    __m256 d;
    __m256 x = _mm256_set1_ps(1.0f);
    //if (!a) return 0;
    //if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
    __m256 in = _mm256_set1_ps(1.0f/n);
    __m256 xtmp;
    do {
        d = _mm256_rcp_ps(pow_avx(x, n - 1));
        d = _mm256_sub_ps(_mm256_mul_ps(a,d),x);
        d = _mm256_mul_ps(d,in);
        //d = (a / pow_avx(x, n - 1) - x) / n;
        x = _mm256_add_ps(x, d); //x+= d;
        xtmp =_mm256_mul_ps(x, _mm256_set1_ps(FLT_EPSILON*100.0f));
    //} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
    } while (get_mask(d, xtmp)); 
    return x;
}

int main()
{
    __m256 d = _mm256_set1_ps(16.0f);
    __m256 out = root_avx(d, 4);
    float result[8];
    int i;
    _mm256_storeu_ps(result, out);

    for(i=0; i<8; i++) {
        printf("%f\n", result[i]);
    } printf("\n");

    //double x = 16;
    //printf("root(%g, 15) = %g\n", x, root(x, 4));

    //double x = pow_(-3.14159, 15);
    //printf("root(%g, 15) = %g\n", x, root(x, 15));
    return 0;
}

解决方案

_mm256_rcp_ps, which maps to the rcpps instruction, performs only an approximate reciprocal. The Intel 64 and IA-32 Architectures Software Developer’s Manual says its relative error may be up to 1.5•2^-12. This is insufficient to cause the root finder to converge with accuracy 100*FLT_EPSILON.

You could use an exact division, such as:

d = pow_avx(x, n-1);
d = _mm256_sub_ps(_mm256_div_ps(a, d), x);

or add some refinement steps for the reciprocal estimate.

Incidentally, if your compiler supports using regular C operators with SIMD objects, consider using the regular C operators instead:

d = pow_avx(x, n-1);
d = a/d - x;

这篇关于FLT_EPSILON的n次方根器时，SSE / AVX的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

FLT_EPSILON的n次方根器时，SSE / AVX [英] FLT_EPSILON for a nth root finder with SSE/AVX

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录关闭

FLT_EPSILON的n次方根器时，SSE / AVX [英] FLT_EPSILON for a nth root finder with SSE/AVX

问题描述

相关文章

C/C++最新文章

热门教程

热门工具

登录 关闭

登录关闭