优化“binary_fold”算法,使其左(或右)关联 [英] optimise "binary_fold" algorithm and make it left (or right) associative

查看:244
本文介绍了优化“binary_fold”算法,使其左(或右)关联的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我的原始问题之后,考虑了一些提出的解决方案我想出了为C ++ 14:

  #include< algorithm> 
#include< exception>
#include< iterator>
#include< cstddef>

template< class It,class Func>
auto binary_fold(it begin,It end,Func op) - > decltype(op(* begin,* end)){
std :: ptrdiff_t diff = end - begin;
switch(diff){
case 0:throw std :: out_of_range(空容器上的二进制折叠);
case 1:return * begin;
case 2:return op(* begin,*(begin + 1));
default:{//首先舍入到最接近的2的倍数,然后提前
it {begin};
int div = diff / 2;
int offset =(div%2 == 1)? (div + 1):div; // round到两个最接近的倍数(向上)
std :: advance(mid,offset);
return op(binary_fold(begin,mid,op),binary_fold(mid,end,op));
}
}
}

此算法将执行二进制操作成对递归,直到获得结果。例如

  std :: vector< int> v = {1,3,5,6,1}; 
auto result = mar :: binary_fold(v.cbegin(),v.cend(),std :: minus< int>());

将解析为:

  1  - (5-6) - (1-3)= 0 

在某些情况下(如上所述),算法将是关联的,但在其他情况下(如下),它将是正确的关联:

  std :: vector< int> v = {7,4,9,2,6,8}; 
auto result = mar :: binary_fold(v.cbegin(),v.cend(),std :: minus< int>());

会导致:

 (7-4) - (9-2) - (6-8)= -2 

我想知道如何进一步优化这个算法,以便:



a。它绝对是左或右的关联



b。它是尽可能快的(这将放在一个openGL绘图循环,所以它必须非常快)。



c。做一个TMP版本,它将计算编译时的偏移量,当容器的大小已知时(这对我的应用程序来说是不必要的,但我只是想知道如何做)。



我对b的第一个想法。是迭代版本可能更快,并且偏移计算可以进一步优化(可能有一些bitwise魔法?)。我被卡住了。

解决方案

我有两个TMP版本。哪一个更好,取决于数据类型,我想:



解决方案A:



首先,让我们为分割点找到一个良好的偏移量(两个幂似乎很大):

  template< std :: ptrdiff_t diff,std :: ptrdiff_t V = 2> 
struct offset
{
static constexpr std :: ptrdiff_t value =
(V * 2< diff - 1)? offset< diff,V * 2> :: value:V;
};

//结束递归
模板< std :: ptrdiff_t diff>
struct offset< diff,1<< 16>
{
static constexpr std :: ptrdiff_t value = 1<< 16;
};

//一些特殊情况
模板<>
struct offset< 0,2>
{
static constexpr std :: ptrdiff_t value = 0;
};

模板<>
struct offset< 1,2>
{
static constexpr std :: ptrdiff_t value = 0;
};

模板<>
struct offset< 2,2>
{
static constexpr std :: ptrdiff_t value = 0;
};

有了这个,我们可以创建一个递归的TMP版本:

  template< std :: ptrdiff_t diff,class It,class Func> 
auto binary_fold_tmp(it begin,It end,Func op)
- > decltype(op(* begin,* end))
{
assert(end - begin == diff);
switch(diff)
{
case 0:
assert(false);
return 0; //这将永远不会发生
case 1:
return * begin;
case 2:
return op(* begin,*(begin + 1));
默认值:
{//首先舍入到最接近的2的倍数,然后提前
it {begin};
std :: advance(mid,offset< diff> :: value);
auto left = binary_fold_tmp< offset< diff> :: value>(begin,mid,op);
auto right =
binary_fold_tmp< diff - offset< diff> :: value>(mid,end,op);
return op(left,right);
}
}
}

非TMP版本这样,例如:

  template< class It,class Func> 
auto binary_fold(it begin,It end,Func op)
- > decltype(op(* begin,* end))
{
const auto diff = end - begin;
assert(diff> 0);
switch(diff)
{
case 1:
return binary_fold_tmp 1(begin,end,op);
case 2:
return binary_fold_tmp 2(begin,end,op);
case 3:
return binary_fold_tmp 3(begin,end,op);
case 4:
return binary_fold_tmp 4(begin,end,op);
case 5:
return binary_fold_tmp 5(begin,end,op);
case 6:
return binary_fold_tmp 6(begin,end,op);
case 7:
return binary_fold_tmp 7(begin,end,op);
case 8:
return binary_fold_tmp 8(begin,end,op);
default:
if(diff< 16)
return op(binary_fold_tmp 8(begin,begin + 8,op),
binary_fold(begin + 8, op));
else if(diff< 32)
return op(binary_fold_tmp 16(begin,begin + 16,op),
binary_fold(begin + 16,end,op)
else
return op(binary_fold_tmp 32(begin,begin + 32,op),
binary_fold(begin + 32,end,op));
}
}

解决方案B: / p>

这会计算成对结果,将它们存储在缓冲区中,然后使用缓冲区调用自身:

  template< std :: ptrdiff_t diff,class It,class Func,size_t ... Is> 
auto binary_fold_pairs_impl(it begin,
It end,
Func op,
const std :: index_sequence< Is ...>&)
- > decltype(op(* begin,* end))
{
std :: decay_t< decltype(* begin)> pair [diff / 2] = {
op(*(begin + 2 * Is),*(begin + 2 * Is + 1))...}

if(diff == 2)
return pairs [0];
else
return binary_fold_pairs_impl< diff / 2>(
& pairs [0],
& pairs [0] + diff / 2,
op,
std :: make_index_equence< diff / 4> {});
}

template< std :: ptrdiff_t diff,class It,class Func>
auto binary_fold_pairs(it begin,It end,Func op) - > decltype(op(* begin,* end))
{
return binary_fold_pairs_impl< diff>(
begin,end,op,std :: make_index_sequence< diff / 2&
}

此模板函数需要 diff 是2的幂。但是,当然你可以将它与非模板版本组合:

 模板< class It,class Func> 
auto binary_fold_mix(It begin,It end,Func op) - > decltype(op(* begin,* end))
{
const auto diff = end - begin;
assert(diff> 0);
switch(diff)
{
case 1:
return * begin;
case 2:
return binary_fold_pairs 2(begin,end,op);
case 3:
return op(binary_fold_pairs 2(begin,begin + 1,op),
*(begin +(diff -1)))
case 4:
return binary_fold_pairs 4(begin,end,op);
case 5:
return op(binary_fold_pairs 4(begin,begin + 4,op),
*(begin +(diff -1)))
case 6:
return op(binary_fold_pairs 4(begin,begin + 4,op),
binary_fold_pairs 4(begin + 4,begin + 6,op)
case 7:
return op(binary_fold_pairs 4(begin,begin + 4,op),
binary_fold_mix(begin + 4,begin + 7,op)
case 8:
return binary_fold_pairs< 8>(begin,end,op);
默认值:
if(diff <= 16)
return op(binary_fold_pairs 8(begin,begin + 8,op),
binary_fold_mix(begin + 8,end ,op));
else if(diff <= 32)
return op(binary_fold_pairs 16(begin,begin + 16,op),
binary_fold_mix(begin + 16,end,op)
else
return op(binary_fold_pairs< 32>(begin,begin + 32,op),
binary_fold_mix(begin + 32,end,op)
}
}



我使用与MtRoad相同的程序测量。在我的机器上,差异不如MtRoad报告的大。使用 -O3 解决方案A和B似乎比MtRoad的版本略快,但在现实中,你需要测试你的类型和数据。



注:我没有太严格地测试我的版本。


Following my original question and considering some of the proposed solutions I came up with this for C++14:

#include <algorithm>
#include <exception>
#include <iterator>
#include <cstddef>

template<class It, class Func>
auto binary_fold(It begin, It end, Func op) ->  decltype(op(*begin, *end)) {
  std::ptrdiff_t diff = end - begin;
  switch (diff) {
    case 0: throw std::out_of_range("binary fold on empty container");
    case 1: return *begin;
    case 2: return op(*begin, *(begin + 1));
    default: { // first round to the nearest multiple of 2 and then advance
      It mid{begin};
      int div = diff/2;
      int offset = (div%2 == 1) ? (div+1) : div; // round to the closest multiple of two (upwards)
      std::advance(mid, offset);
      return op( binary_fold(begin,mid,op), binary_fold(mid,end,op) );
    }
  }
}

this algorithm will perform a binary operation pairwise recursively until a result is obtained. E.g.

 std::vector<int> v = {1,3,5,6,1};
 auto result = mar::binary_fold(v.cbegin(), v.cend(), std::minus<int>());

will resolve in:

1 - (5-6) - (1-3) = 0

In some cases (like the one above) the algorithm will be left associative, but in others (like the following), it will be right associative:

  std::vector<int> v = {7,4,9,2,6,8};
  auto result = mar::binary_fold(v.cbegin(), v.cend(), std::minus<int>());

results in:

(7-4) - (9-2) - (6-8) = -2

I'm wondering how I can further optimise this algorithm so that:

a. it is definitely left or right associative

b. it is as fast as possible (this will be put within an openGL drawing loop, so it has to be very fast).

c. make a TMP version that will compute the offsets in compilation time when the size of the container is known (this is not necessary for my application, but I'm just curious of how it can be done).

my first thoughts on b. is that an iterative version would be probably faster, and that the offset calculation could be further optimised (maybe with some bitwise magic?). I'm stuck nevertheless.

解决方案

I have two TMP versions. Which one is better, depends on the data types, I guess:

Solution A:

First, let's find a good offset for the split point (powers of two seem great):

template<std::ptrdiff_t diff, std::ptrdiff_t V = 2>
struct offset
{
  static constexpr std::ptrdiff_t value =
      (V * 2 < diff - 1) ? offset<diff, V * 2>::value : V;
};

// End recursion
template<std::ptrdiff_t diff>
struct offset<diff, 1<<16>
{
  static constexpr std::ptrdiff_t value = 1<<16;
};

// Some special cases
template<> 
struct offset<0, 2>
{
  static constexpr std::ptrdiff_t value = 0;
};

template<>
struct offset<1, 2> 
{
  static constexpr std::ptrdiff_t value = 0;
};

template<>
struct offset<2, 2>
{
  static constexpr std::ptrdiff_t value = 0;
};

With this, we can create a recursive TMP version:

template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_tmp(It begin, It end, Func op)
    -> decltype(op(*begin, *end))
{
  assert(end - begin == diff);
  switch (diff)
  {
    case 0:
      assert(false);
      return 0;  // This will never happen
    case 1:
      return *begin;
    case 2:
      return op(*begin, *(begin + 1));
    default:
    {  // first round to the nearest multiple of 2 and then advance
      It mid{begin};
      std::advance(mid, offset<diff>::value);
      auto left = binary_fold_tmp<offset<diff>::value>(begin, mid, op);
      auto right =
          binary_fold_tmp<diff - offset<diff>::value>(mid, end, op);
      return op(left, right);
    }
  }
}

This can be combined with a non-TMP version like this, for instance:

template <class It, class Func>
auto binary_fold(It begin, It end, Func op)
    -> decltype(op(*begin, *end))
{
  const auto diff = end - begin;
  assert(diff > 0);
  switch (diff)
  {
    case 1:
      return binary_fold_tmp<1>(begin, end, op);
    case 2:
      return binary_fold_tmp<2>(begin, end, op);
    case 3:
      return binary_fold_tmp<3>(begin, end, op);
    case 4:
      return binary_fold_tmp<4>(begin, end, op);
    case 5:
      return binary_fold_tmp<5>(begin, end, op);
    case 6:
      return binary_fold_tmp<6>(begin, end, op);
    case 7:
      return binary_fold_tmp<7>(begin, end, op);
    case 8:
      return binary_fold_tmp<8>(begin, end, op);
    default:
      if (diff < 16)
        return op(binary_fold_tmp<8>(begin, begin + 8, op),
                  binary_fold(begin + 8, end, op));
      else if (diff < 32)
        return op(binary_fold_tmp<16>(begin, begin + 16, op),
                  binary_fold(begin + 16, end, op));
      else
        return op(binary_fold_tmp<32>(begin, begin + 32, op),
                  binary_fold(begin + 32, end, op));
  }
}

Solution B:

This calculates the pair-wise results, stores them in a buffer, and then calls itself with the buffer:

template <std::ptrdiff_t diff, class It, class Func, size_t... Is>
auto binary_fold_pairs_impl(It begin,
                            It end,
                            Func op,
                            const std::index_sequence<Is...>&)
    -> decltype(op(*begin, *end))
{
  std::decay_t<decltype(*begin)> pairs[diff / 2] = {
      op(*(begin + 2 * Is), *(begin + 2 * Is + 1))...};

  if (diff == 2)
    return pairs[0];
  else
    return binary_fold_pairs_impl<diff / 2>(
        &pairs[0],
        &pairs[0] + diff / 2,
        op,
        std::make_index_sequence<diff / 4>{});
}

template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_pairs(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
  return binary_fold_pairs_impl<diff>(
      begin, end, op, std::make_index_sequence<diff / 2>{});
}

This template function requires diff to be a power of 2. But of course you can combine it with a non-template version, again:

template <class It, class Func>
auto binary_fold_mix(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
  const auto diff = end - begin;
  assert(diff > 0);
  switch (diff)
  {
    case 1:
      return *begin;
    case 2:
      return binary_fold_pairs<2>(begin, end, op);
    case 3:
      return op(binary_fold_pairs<2>(begin, begin + 1, op),
                *(begin + (diff - 1)));
    case 4:
      return binary_fold_pairs<4>(begin, end, op);
    case 5:
      return op(binary_fold_pairs<4>(begin, begin + 4, op),
                *(begin + (diff - 1)));
    case 6:
      return op(binary_fold_pairs<4>(begin, begin + 4, op),
                binary_fold_pairs<4>(begin + 4, begin + 6, op));
    case 7:
      return op(binary_fold_pairs<4>(begin, begin + 4, op),
                binary_fold_mix(begin + 4, begin + 7, op));
    case 8:
      return binary_fold_pairs<8>(begin, end, op);
    default:
      if (diff <= 16)
        return op(binary_fold_pairs<8>(begin, begin + 8, op),
                  binary_fold_mix(begin + 8, end, op));
      else if (diff <= 32)
        return op(binary_fold_pairs<16>(begin, begin + 16, op),
                  binary_fold_mix(begin + 16, end, op));
      else
        return op(binary_fold_pairs<32>(begin, begin + 32, op),
                  binary_fold_mix(begin + 32, end, op));
  }
}

I measured with the same program as MtRoad. On my machine, the differences are not as big as MtRoad reported. With -O3 solutions A and B seem to be slightly faster than MtRoad's version, but in reality, you need to test with your types and data.

Remark: I did not test my versions too rigorously.

这篇关于优化“binary_fold”算法,使其左(或右)关联的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆