Thrust 复制 - 按列主序写入的 OutputIterator [英] Thrust copy - OutputIterator column-major order

查看：133 发布时间：2017/3/4 14:07:04 cuda thrust

本文介绍了“Thrust 复制 - 按列主序写入的 OutputIterator”这一问题的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我有一个由矩阵组成的向量（每个矩阵以列主序数组的形式存储），我想把它们垂直拼接起来。为此，我想使用 thrust 框架中的 copy 函数，如下面的示例代码片段所示：

  int offset = 0; 
 for（int i = 0; i  thrust :: copy（
 thrust :: device_ptr& ），
 thrust :: device_ptr< float>（matrices [i]）+ rows [i] * cols [i]，
 thrust :: device_ptr& ）; 
 
 offset + = rows [i] * cols [i]; 
}

编辑：扩展示例：

$ b b

问题是，如果我有一个矩阵 A = [[1, 2, 3], [4, 5, 6]]（2 行 3 列；在内存中为 [1, 4, 2, 5, 3, 6]）和另一个矩阵 B = [[7, 8, 9]]（1 行 3 列；在内存中为 [7, 8, 9]），那么得到的矩阵 C 不是 [[1, 2, 3], [4, 5, 6], [7, 8, 9]]（3 行 3 列；在内存中为 [1, 4, 7, 2, 5, 8, 3, 6, 9]），而是 [[1, 5, 7], [4, 3, 8], [2, 6, 9]]（3 行 3 列；在内存中为 [1, 4, 2, 5, 3, 6, 7, 8, 9]）。

有没有办法为这个问题创建一个特殊的 OutputIterator（我已经搜索过，但什么也没找到），或者有没有一种快速的替代方法？

编辑：SSCCE

 #include <thrust/host_vector.h>
 #include <thrust/generate.h>
 #include <thrust/device_vector.h>
 #include <thrust/copy.h>
 #include <iostream>
 #include <vector>
 
 void printMat2d（thrust :: device_vector< float>& mat，int rows，int cols）{
 for（int row = 0; row< rows; ++ row） 
 for（int col = 0; col< cols; ++ col）{
 std :: cout< mat [row + col * rows]< ; 
} 
 std :: cout<< std :: endl; 
} 
} 
 
 void printMat1d（thrust :: device_vector< float>& mat，int rows，int cols）{
 for（int idx = 0; idx< cols * rows; ++ idx）{
 std :: cout< mat [idx]< ; 
} 
 std :: cout<< std :: endl; 
} 
 
 void generateMat（thrust :: device_vector< float>& mat，int rows，int cols，int add）{
 thrust :: host_vector& matHost（rows * cols）; 
 int val = 0; 
 for（int row = 0; row< rows; ++ row）{
 for（int col = 0; col< cols; ++ col）{
 matHost [row + col * rows] = val + add; 
 val ++; 
} 
} 
 mat = matHost; 
} 
 
 int main（）{
 std :: vector< int>行（2）; 
 rows [0] = 2; 
 rows [1] = 3; 
 std :: vector< int> cols（2）; 
 cols [0] = 3; 
 cols [1] = 3; 
 
 //生成矩阵
 std :: vector< thrust :: device_vector< float> >矩阵（2）; 
 for（size_t i = 0; i  generateMat（matrices [i]，rows [i]，cols [i]，i * 10） ; 
 
 std :: cout<< mat_< i<< =<< std :: endl; 
 printMat2d（matrices [i]，rows [i]，cols [i]）; 
 printMat1d（matrices [i]，rows [i]，cols [i]）; 
} 
 
 // copy 
 int resultRows = 5; 
 int resultCols = 3; 
 thrust :: device_vector< float> result（resultRows * resultCols）; 
 int offset = 0; 
 for（int i = 0; i  thrust :: copy（
 matrices [i] .begin（），
 matrices [i] .end（），
 result.begin（）+ offset 
）; 
 
 offset + = rows [i] * cols [i]; 
} 
 
 std :: cout<< result =< std :: endl; 
 printMat2d（result，resultRows，resultCols）; 
 printMat1d（result，resultRows，resultCols）; 
 
 return 0; 
}

解决方案

编辑：我替换了之前的答案（之前使用的是逐行的 strided range 方法），改用一种略有不同的方法，把复制操作减少为单次 thrust 调用（每个待复制的矩阵调用一次）。

这里的关键思想是使用一个函子（functor），把行主序的内存索引转换为列主序的内存索引。然后，该函子可以与 counting_iterator 一起使用（通过 make_transform_iterator），生成任意的从行主序到列主序的内存索引。这些索引随后可以用在源矩阵的 permutation_iterator 中，以选择要复制的元素；也可以用在目标矩阵的 permutation_iterator 中，以选择要写入的内存位置。关于 transform_iterator、counting_iterator 和 permutation_iterator 的一般介绍，请参阅 thrust 快速入门指南。我做这个练习时碰巧使用的是 CUDA 5.0 和 thrust 1.5.3。
#include< thrust / device_vector.h> #include< thrust / iterator / counting_iterator.h> #include< thrust / iterator / transform_iterator.h> #include< thrust / iterator / permutation_iterator.h> #include< thrust / functional.h> #include< thrust / copy.h> #include< iostream> struct rm2cm_idx_functor：public thrust :: unary_function< int，int> { int r; int c; rm2cm_idx_functor（int _r，int _c）：r（_r），c（_c）{}; __host__ __device__ int operator（）（int idx）{ unsigned my_r = idx / c; unsigned my_c = idx％c; return（my_c * r）+ my_r; } }; typedef float my_type; void printMat2d（thrust :: device_vector< my_type>& mat，int rows，int cols）{ for（int row = 0; row& + row）{ for（int col = 0; col< cols; ++ col）{ std :: cout< mat [row + col * rows]< ; } std :: cout<< std :: endl; } } void printMat1d（thrust :: device_vector< my_type>& mat，int rows，int cols）{ for（int idx = 0; idx< cols * rows; ++ idx）{ std :: cout< mat [idx]< ; } std :: cout<< std :: endl; } void generateMat（thrust :: device_vector< my_type>& mat，int rows，int cols，int add）{ thrust :: host_vector& matHost（rows * cols）; int val = 0; for（int row = 0; row< rows; ++ row）{ for（int col = 0; col< cols; ++ col）{ matHost [row + col * rows] = val + add; val ++; } } mat = matHost; } void copyMat（thrust :: device_vector< my_type>& src，thrust :: device_vector< my_type>& dst，unsigned src_rows，unsigned src_cols，unsigned dst_rows ，unsigned offset）{ thrust :: copy_n（thrust :: make_permutation_iterator（src.begin（），thrust :: make_transform_iterator（thrust :: counting_iterator< int>（0），rm2cm_idx_functor（src_rows，src_cols） src_rows * src_cols，thrust :: make_permutation_iterator（dst.begin（），thrust :: make_transform_iterator（thrust :: counting_iterator< int>（offset），rm2cm_idx_functor（dst_rows，src_cols））））; } int main（）{ std :: vector< int>行（2）; rows [0] = 2; rows [1] = 3; std :: vector< int> cols（2）; cols [0] = 3; cols [1] = 3; //生成矩阵 std :: vector< thrust :: device_vector< my_type> >矩阵（2）; for（size_t i = 0; i generateMat（matrices [i]，rows [i]，cols [i]，i 
* 10） ; std :: cout<< mat_<< i<< =<< std :: endl; printMat2d（matrices [i]，rows [i]，cols [i]）; printMat1d（matrices [i]，rows [i]，cols [i]）; } // copy int resultRows = 5; int resultCols = 3; thrust :: device_vector< my_type> result（resultRows * resultCols）; int offset = 0; for（int i = 0; i copyMat（matrices [i]，result，rows [i]，cols [i ]，resultRows，offset）; offset + = rows [i] * cols [i]; } std :: cout< result =< std :: endl; printMat2d（result，resultRows，resultCols）; printMat1d（result，resultRows，resultCols）; return 0; }
这也假定源矩阵的列数等于目标矩阵的列数，这一点似乎隐含在你的问题陈述中。照例声明：不敢说这段代码没有 bug，但对于原始问题陈述中内置的测试用例，它看起来是可行的。

这种方法可能还可以进一步改进。目前，与 thrust::copy_n 调用相关的读操作和写操作都是非合并（uncoalesced）的。我们可以让这两个操作中的一个变为合并访问，从而进一步改进。这需要把读和写两个索引转换函子的效果合并为一个同时考虑源和目标维度的映射函子。有了这个单一的映射函子，copy_n 调用的第一个参数就可以直接是源向量。我认为也应该可以改用 thrust::gather 或 thrust::scatter。不过，我还没有完全把它实现出来。

I have a vector of matrices (stored as column major arrays) that I want to concat vertically. Therefore, I want to utilize the copy function from the thrust framework as in the following example snippet:
// Naive approach from the question: append every column-major matrix to the
// result buffer back to back. This concatenates the raw memory, which is NOT
// a vertical concatenation for column-major data (see the example below).
int offset = 0;
for (size_t i = 0; i < matrices.size(); ++i) {
    thrust::copy(
        thrust::device_ptr<float>(matrices[i]),
        thrust::device_ptr<float>(matrices[i]) + rows[i] * cols[i],
        thrust::device_ptr<float>(result) + offset
    );

    // Advance by the number of elements just written.
    offset += rows[i] * cols[i];
}
EDIT: extended example:

The problem is that if I have a matrix A = [[1, 2, 3], [4, 5, 6]] (2 rows, 3 cols; in memory [1, 4, 2, 5, 3, 6]) and another B = [[7, 8, 9]] (1 row, 3 cols; in memory [7, 8, 9]), the resulting matrix C is not [[1, 2, 3], [4, 5, 6], [7, 8, 9]] (3 rows, 3 cols; in memory [1, 4, 7, 2, 5, 8, 3, 6, 9]), but [[1, 5, 7], [4, 3, 8], [2, 6, 9]] (3 rows, 3 cols; in memory [1, 4, 2, 5, 3, 6, 7, 8, 9]).

Is there a way to create a special OutputIterator for this problem (I have searched for it, but found nothing), or a fast alternative way?

EDIT: SSCCE
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cstddef>
#include <iostream>
#include <vector>

// Prints a column-major matrix row by row, one text line per matrix row.
// Element (row, col) is read from index row + col * rows.
void printMat2d(const thrust::device_vector<float>& mat, int rows, int cols) {
    // Copy the data to the host once instead of reading the device vector
    // element by element inside the loops.
    const thrust::host_vector<float> host(mat);
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            std::cout << host[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

// Prints the first rows * cols elements of mat in memory order on one line.
void printMat1d(const thrust::device_vector<float>& mat, int rows, int cols) {
    const thrust::host_vector<float> host(mat);
    const int count = cols * rows;
    for (int idx = 0; idx < count; ++idx) {
        std::cout << host[idx] << " ";
    }
    std::cout << std::endl;
}

// Fills mat with a rows x cols column-major matrix whose entries, read in
// row-major order, are add, add + 1, add + 2, ...
void generateMat(thrust::device_vector<float>& mat, int rows, int cols, int add) {
    // Build the matrix on the host, then assign it to the device vector.
    thrust::host_vector<float> matHost(rows * cols);
    int val = 0;
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            matHost[row + col * rows] = static_cast<float>(val + add);
            ++val;
        }
    }
    mat = matHost;
}

// Reproduces the problem from the question: two column-major matrices are
// copied back to back into one buffer, which does not give their vertical
// concatenation.
int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    // generate matrices
    std::vector<thrust::device_vector<float> > matrices(2);
    for (size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], static_cast<int>(i) * 10);
        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    // copy
    // The result has as many rows as all inputs together (2 + 3 = 5 here) and
    // the shared column count (3 here).
    int resultRows = 0;
    for (size_t i = 0; i < rows.size(); ++i) {
        resultRows += rows[i];
    }
    const int resultCols = cols[0];
    thrust::device_vector<float> result(resultRows * resultCols);
    int offset = 0;
    for (size_t i = 0; i < matrices.size(); ++i) {
        // Intentionally naive: this appends the raw column-major memory, which
        // is exactly the behavior the question asks how to avoid.
        thrust::copy(
            matrices[i].begin(),
            matrices[i].end(),
            result.begin() + offset
        );
        offset += rows[i] * cols[i];
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

解决方案
EDIT: I've replaced my previous answer that used the strided range per row method, with a slightly different approach, that gets the copy operation down to a single thrust call (per matrix to be copied).

The key idea here was to use a functor that converts row-major memory indexing to column-major memory indexing. This functor can then be used with a counting_iterator to create arbitrary row-major to column major memory indices (via make_transform_iterator). These indices can then be used in a permutation_iterator for the source matrix to select the element to be copied and a permutation_iterator for the destination matrix to select the memory position to copy to. For a general review of transform_iterator, counting_iterator, and permutation_iterator, refer to the thrust quick start guide. I happened to be using CUDA 5.0 and thrust 1.5.3 for this exercise.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

// Maps a row-major linear index of an r x c matrix to the column-major linear
// index of the same element.
struct rm2cm_idx_functor : public thrust::unary_function<int, int> {
    int r;  // number of rows of the matrix being indexed
    int c;  // number of columns of the matrix being indexed

    __host__ __device__
    rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {}

    __host__ __device__
    int operator()(int idx) const {
        const int my_r = idx / c;  // row of the element in row-major order
        const int my_c = idx % c;  // column of the element in row-major order
        return (my_c * r) + my_r;
    }
};

typedef float my_type;

// Prints a column-major matrix row by row, one text line per matrix row.
// Element (row, col) is read from index row + col * rows.
void printMat2d(const thrust::device_vector<my_type>& mat, int rows, int cols) {
    // Copy the data to the host once instead of reading the device vector
    // element by element inside the loops.
    const thrust::host_vector<my_type> host(mat);
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            std::cout << host[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

// Prints the first rows * cols elements of mat in memory order on one line.
void printMat1d(const thrust::device_vector<my_type>& mat, int rows, int cols) {
    const thrust::host_vector<my_type> host(mat);
    const int count = cols * rows;
    for (int idx = 0; idx < count; ++idx) {
        std::cout << host[idx] << " ";
    }
    std::cout << std::endl;
}

// Fills mat with a rows x cols column-major matrix whose entries, read in
// row-major order, are add, add + 1, add + 2, ...
void generateMat(thrust::device_vector<my_type>& mat, int rows, int cols, int add) {
    // Build the matrix on the host, then assign it to the device vector.
    thrust::host_vector<my_type> matHost(rows * cols);
    int val = 0;
    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            matHost[row + col * rows] = static_cast<my_type>(val + add);
            ++val;
        }
    }
    mat = matHost;
}

// Copies the column-major src matrix (src_rows x src_cols) into the column-major
// dst matrix (dst_rows x src_cols).
//
// offset is the row-major linear index in dst at which the first element of src
// is placed; passing (rows already written) * src_cols stacks src below them.
// Source and destination are assumed to have the same number of columns.
//
// Throws std::invalid_argument when src or dst is too small for the requested
// dimensions, or when the copied range would not fit into dst.
void copyMat(thrust::device_vector<my_type>& src, thrust::device_vector<my_type>& dst,
             unsigned src_rows, unsigned src_cols, unsigned dst_rows, unsigned offset) {
    const std::size_t count = static_cast<std::size_t>(src_rows) * src_cols;
    if (count == 0) {
        return;  // nothing to copy
    }

    const std::size_t dst_count = static_cast<std::size_t>(dst_rows) * src_cols;
    if (src.size() < count) {
        throw std::invalid_argument("copyMat: src holds fewer than src_rows * src_cols elements");
    }
    if (dst.size() < dst_count) {
        throw std::invalid_argument("copyMat: dst holds fewer than dst_rows * src_cols elements");
    }
    // Every row-major index in [offset, offset + count) must address dst.
    if (offset > dst_count || count > dst_count - offset) {
        throw std::invalid_argument("copyMat: copied range does not fit into dst");
    }

    // Both sides are walked in row-major order; the permutation iterators
    // translate each row-major index to the column-major storage position.
    thrust::copy_n(
        thrust::make_permutation_iterator(
            src.begin(),
            thrust::make_transform_iterator(
                thrust::counting_iterator<int>(0),
                rm2cm_idx_functor(static_cast<int>(src_rows), static_cast<int>(src_cols)))),
        count,
        thrust::make_permutation_iterator(
            dst.begin(),
            thrust::make_transform_iterator(
                thrust::counting_iterator<int>(static_cast<int>(offset)),
                rm2cm_idx_functor(static_cast<int>(dst_rows), static_cast<int>(src_cols)))));
}

int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    // generate matrices
    std::vector<thrust::device_vector<my_type> > matrices(2);
    for (std::size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], static_cast<int>(i) * 10);
        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    // copy
    // The result has as many rows as all inputs together (2 + 3 = 5 here) and
    // the shared column count (3 here).
    int resultRows = 0;
    for (std::size_t i = 0; i < rows.size(); ++i) {
        resultRows += rows[i];
    }
    const int resultCols = cols[0];
    thrust::device_vector<my_type> result(resultRows * resultCols);
    unsigned offset = 0;
    for (std::size_t i = 0; i < matrices.size(); ++i) {
        copyMat(matrices[i], result,
                static_cast<unsigned>(rows[i]), static_cast<unsigned>(cols[i]),
                static_cast<unsigned>(resultRows), offset);
        // Advance by the number of elements just placed (row-major count).
        offset += static_cast<unsigned>(rows[i] * cols[i]);
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}
This also assumes that source columns == destination columns, which seems to be implicit in your problem statement. Standard caveat: not saying this is bug free, but it seems to work for the test case built into the original problem statement.

This approach can probably still be further improved. Right now both the read operation and the write operation associated with the thrust::copy_n call will be uncoalesced. We can further improve this by making one of these two operations coalesced. This would necessitate combining the effect of index conversion functor for both read and write into a single mapping functor, which takes into account both source and destination dimensions. With a single mapping functor, the first term of the copy_n call could be just the source vector. I think it should also be possible to alternatively use thrust::gather or thrust::scatter. However, I haven't fully worked it out.

这篇关于“Thrust 复制 - 按列主序写入的 OutputIterator”的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持IT屋！

查看全文

Thrust 复制 - 按列主序写入的 OutputIterator [英] Thrust copy - OutputIterator column-major order

问题描述

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录关闭

Thrust 复制 - 按列主序写入的 OutputIterator [英] Thrust copy - OutputIterator column-major order

问题描述

相关文章

其它硬件开发最新文章

热门教程

热门工具

登录 关闭

登录关闭