Parrot vs Thrust#
This page provides comparisons between Parrot code and equivalent Thrust code for common operations. Parrot is built on top of Thrust but provides a more concise and expressive API.
Examples#
✅ Arbitrary Transformation (6.1x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto A = parrot::array({3, 4, 0, 8, 2});
auto B = parrot::array({6, 7, 2, 1, 8});
auto C = parrot::array({2, 5, 7, 4, 3});
(B * C + A).print(); // 15 39 14 12 26
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/arbitrary_transformation.cu
#include <thrust/detail/config.h>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>
#include <iostream>
// This example shows how to implement an arbitrary transformation of
// the form output[i] = F(first[i], second[i], third[i], ... ).
// In this example, we use a function with 3 inputs and 1 output.
//
// Iterators for all four vectors (3 inputs + 1 output) are "zipped"
// into a single sequence of tuples with the zip_iterator.
//
// The arbitrary_functor receives a tuple that contains four elements,
// which are references to values in each of the four sequences. When we
// access the tuple 't' with the get() function,
// get<0>(t) returns a reference to A[i],
// get<1>(t) returns a reference to B[i],
// get<2>(t) returns a reference to C[i],
// get<3>(t) returns a reference to D[i].
//
// In this example, we can implement the transformation,
// D[i] = A[i] + B[i] * C[i];
// by invoking arbitrary_functor() on each of the tuples using for_each.
//
// If we are using a functor that is not designed for zip iterators by taking a
// tuple instead of individual arguments we can adapt this function using the
// zip_function adaptor (C++11 only).
//
// Note that we could extend this example to implement functions with an
// arbitrary number of input arguments by zipping more sequence together.
// With the same approach we can have multiple *output* sequences, if we
// wanted to implement something like
// D[i] = A[i] + B[i] * C[i];
// E[i] = A[i] + B[i] + C[i];
//
// The possibilities are endless! :)
struct arbitrary_functor1 {
template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
// D[i] = A[i] + B[i] * C[i];
thrust::get<3>(t) =
thrust::get<0>(t) + thrust::get<1>(t) * thrust::get<2>(t);
}
};
struct arbitrary_functor2 {
__host__ __device__ void operator()(const float &a, const float &b,
const float &c, float &d) {
// D[i] = A[i] + B[i] * C[i];
d = a + b * c;
}
};
int main() {
// allocate and initialize
thrust::device_vector<float> A{3, 4, 0, 8, 2};
thrust::device_vector<float> B{6, 7, 2, 1, 8};
thrust::device_vector<float> C{2, 5, 7, 4, 3};
thrust::device_vector<float> D1(5);
// apply the transformation
thrust::for_each(
thrust::make_zip_iterator(A.begin(), B.begin(), C.begin(), D1.begin()),
thrust::make_zip_iterator(A.end(), B.end(), C.end(), D1.end()),
arbitrary_functor1());
// print the output
std::cout << "Tuple functor" << std::endl;
for (size_t i = 0; i < A.size(); i++) {
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i]
<< std::endl;
}
// apply the transformation using zip_function
thrust::device_vector<float> D2(5);
thrust::for_each(
thrust::make_zip_iterator(A.begin(), B.begin(), C.begin(), D2.begin()),
thrust::make_zip_iterator(A.end(), B.end(), C.end(), D2.end()),
thrust::make_zip_function(arbitrary_functor2()));
// print the output
std::cout << "N-ary functor" << std::endl;
for (size_t i = 0; i < A.size(); i++) {
std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i]
<< std::endl;
}
}
✅ Basic Vector (2.7x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
parrot::array({14, 20, 38, 46})
.print() // 14 20 38 46
.take(2)
.print(); // 14 20
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/basic_vector.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
int main() {
// H holds 4 integers
thrust::host_vector<int> H{14, 20, 38, 46};
// H.size() returns the size of vector H
std::cout << "H has size " << H.size() << std::endl;
// print contents of H
for (size_t i = 0; i < H.size(); i++) {
std::cout << "H[" << i << "] = " << H[i] << std::endl;
}
// resize H
H.resize(2);
std::cout << "H now has size " << H.size() << std::endl;
// Copy host_vector H to device_vector D
thrust::device_vector<int> D = H;
// elements of D can be modified
D[0] = 99;
D[1] = 88;
// print contents of D
for (size_t i = 0; i < D.size(); i++) {
std::cout << "D[" << i << "] = " << D[i] << std::endl;
}
// H and D are automatically deleted when the function returns
return 0;
}
✅ Bounding Box (7.3x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto ll = parrot::matrix(1.0, {2, 10}).rand();
auto ur = parrot::matrix(1.0, {2, 10}).rand().add(1);
ll.minr<2>().print();
ur.maxr<2>().print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/bounding_box.cu
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/pair.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>
// This example shows how to compute a bounding box
// for a set of points in two dimensions.
struct point2d {
float x, y;
__host__ __device__ point2d() : x(0), y(0) {}
__host__ __device__ point2d(float _x, float _y) : x(_x), y(_y) {}
};
// bounding box type
struct bbox {
// construct an empty box
__host__ __device__ bbox() {}
// construct a box from a single point
__host__ __device__ bbox(const point2d &point)
: lower_left(point), upper_right(point) {}
// construct a box from a single point
__host__ __device__ bbox &operator=(const point2d &point) {
lower_left = point;
upper_right = point;
return *this;
}
// construct a box from a pair of points
__host__ __device__ bbox(const point2d &ll, const point2d &ur)
: lower_left(ll), upper_right(ur) {}
point2d lower_left, upper_right;
};
// reduce a pair of bounding boxes (a,b) to a bounding box containing a and b
struct bbox_union {
__host__ __device__ bbox operator()(bbox a, bbox b) {
// lower left corner
point2d ll(thrust::min(a.lower_left.x, b.lower_left.x),
thrust::min(a.lower_left.y, b.lower_left.y));
// upper right corner
point2d ur(thrust::max(a.upper_right.x, b.upper_right.x),
thrust::max(a.upper_right.y, b.upper_right.y));
return bbox(ll, ur);
}
};
int main() {
const size_t N = 40;
// allocate storage for points
thrust::device_vector<point2d> points(N);
// generate some random points in the unit square
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
for (size_t i = 0; i < N; i++) {
float x = u01(rng);
float y = u01(rng);
points[i] = point2d(x, y);
}
// initial bounding box contains first point
bbox init(points[0], points[0]);
// compute the bounding box for the point set
bbox result =
thrust::reduce(points.begin(), points.end(), init, bbox_union{});
// print output
std::cout << "bounding box " << std::fixed;
std::cout << "(" << result.lower_left.x << "," << result.lower_left.y << ") ";
std::cout << "(" << result.upper_right.x << "," << result.upper_right.y << ")"
<< std::endl;
return 0;
}
✅ Constant Iterator (2.3x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto data = parrot::array({3, 7, 2, 5});
auto constant = parrot::scalar(10).repeat(4);
data.add(constant).print(); // [13, 17, 12, 15] - first method
data.add(10).print(); // [13, 17, 12, 15] - better method
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/constant_iterator.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform.h>
#include <iostream>
#include <iterator>
int main() {
thrust::device_vector<int> data{3, 7, 2, 5};
// add 10 to all values in data
thrust::transform(data.begin(), data.end(),
thrust::constant_iterator<int>(10), data.begin(),
cuda::std::plus<int>());
// data is now [13, 17, 12, 15]
// print result
thrust::copy(data.begin(), data.end(),
std::ostream_iterator<int>(std::cout, "\n"));
return 0;
}
✅ Counting Iterator (2.5x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto stencil = parrot::array({0, 1, 1, 0, 0, 1, 0, 1});
auto indices = parrot::range(8)
.minus(1)
.keep(stencil)
.print(); // [1, 2, 5, 7]
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/counting_iterator.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
#include <iterator>
int main() {
// this example computes indices for all the nonzero values in a sequence
// sequence of zero and nonzero values
thrust::device_vector<int> stencil{0, 1, 1, 0, 0, 1, 0, 1};
// storage for the nonzero indices
thrust::device_vector<int> indices(8);
// counting iterators define a sequence [0, 8)
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first + 8;
// compute indices of nonzero elements
using IndexIterator = thrust::device_vector<int>::iterator;
IndexIterator indices_end = thrust::copy_if(
first, last, stencil.begin(), indices.begin(), cuda::std::identity{});
// indices now contains [1,2,5,7]
// print result
std::cout << "found " << cuda::std::distance(indices.begin(), indices_end)
<< " nonzero values at indices:\n";
thrust::copy(indices.begin(), indices_end,
std::ostream_iterator<int>(std::cout, "\n"));
return 0;
}
✅ Dot Products With Zip (10.1x less code)#
Parrot Code
#include <parrot.hpp>
int main() {
int N = 1000;
auto a = parrot::matrix(1.0, {3, N}).rand();
auto b = parrot::matrix(1.0, {3, N}).rand();
a.times(b).sum<2>().print(); // 256.714 236.45 256.613 (roughly)
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/dot_products_with_zip.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>
// This example shows how thrust::zip_iterator can be used to create a
// 'virtual' array of structures. In this case the structure is a 3d
// vector type (Float3) whose (x,y,z) components will be stored in
// three separate float arrays. The zip_iterator "zips" these arrays
// into a single virtual Float3 array.
// We'll use a 3-tuple to store our 3d vector type
using Float3 = thrust::tuple<float, float, float>;
// This functor implements the dot product between 3d vectors
struct DotProduct {
__host__ __device__ float operator()(const Float3 &a, const Float3 &b) const {
return thrust::get<0>(a) * thrust::get<0>(b) + // x components
thrust::get<1>(a) * thrust::get<1>(b) + // y components
thrust::get<2>(a) * thrust::get<2>(b); // z components
}
};
// Return a host vector with random values in the range [0,1)
thrust::host_vector<float>
random_vector(const size_t N,
unsigned int seed = thrust::default_random_engine::default_seed) {
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for (size_t i = 0; i < N; ++i) {
temp[i] = u01(rng);
}
return temp;
}
int main() {
// number of vectors
const size_t N = 1000;
// We'll store the components of the 3d vectors in separate arrays. One set of
// arrays will store the 'A' vectors and another set will store the 'B'
// vectors.
// This 'structure of arrays' (SoA) approach is usually more efficient than
// the 'array of structures' (AoS) approach. The primary reason is that
// structures, like Float3, don't always obey the memory coalescing rules, so
// they are not efficiently transferred to and from memory. Another reason to
// prefer SoA to AoS is that we don't always want to process all members of
// the structure. For example, if we only need to look at first element of
// the structure then it is wasteful to load the entire structure from memory.
// With the SoA approach, we can chose which elements of the structure we wish
// to read.
thrust::device_vector<float> A0 =
random_vector(N); // x components of the 'A' vectors
thrust::device_vector<float> A1 =
random_vector(N); // y components of the 'A' vectors
thrust::device_vector<float> A2 =
random_vector(N); // z components of the 'A' vectors
thrust::device_vector<float> B0 =
random_vector(N); // x components of the 'B' vectors
thrust::device_vector<float> B1 =
random_vector(N); // y components of the 'B' vectors
thrust::device_vector<float> B2 =
random_vector(N); // z components of the 'B' vectors
// Storage for result of each dot product
thrust::device_vector<float> result(N);
// We'll now illustrate two ways to use zip_iterator to compute the dot
// products. The first method is verbose but shows how the parts fit
// together. The second method hides these details and is more concise.
// METHOD #1
// Defining a zip_iterator type can be a little cumbersome ...
using FloatIterator = thrust::device_vector<float>::iterator;
using FloatIteratorTuple =
thrust::tuple<FloatIterator, FloatIterator, FloatIterator>;
using Float3Iterator = thrust::zip_iterator<FloatIteratorTuple>;
// Now we'll create some zip_iterators for A and B
Float3Iterator A_first =
thrust::make_zip_iterator(A0.begin(), A1.begin(), A2.begin());
Float3Iterator A_last =
thrust::make_zip_iterator(A0.end(), A1.end(), A2.end());
Float3Iterator B_first =
thrust::make_zip_iterator(B0.begin(), B1.begin(), B2.begin());
// Finally, we pass the zip_iterators into transform() as if they
// were 'normal' iterators for a device_vector<Float3>.
thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());
// METHOD #2
// Alternatively, we can avoid creating variables for X_first, X_last,
// and Y_first and invoke transform() directly.
thrust::transform(
thrust::make_zip_iterator(A0.begin(), A1.begin(), A2.begin()),
thrust::make_zip_iterator(A0.end(), A1.end(), A2.end()),
thrust::make_zip_iterator(B0.begin(), B1.begin(), B2.begin()),
result.begin(), DotProduct());
// Finally, we'll print a few results
// Example output
// (0.840188,0.45724,0.0860517) * (0.0587587,0.456151,0.322409) = 0.285683
// (0.394383,0.640368,0.180886) * (0.0138811,0.24875,0.0221609) = 0.168775
// (0.783099,0.717092,0.426423) * (0.622212,0.0699601,0.234811) = 0.63755
// (0.79844,0.460067,0.0470658) * (0.0391351,0.742097,0.354747) = 0.389358
std::cout << std::fixed;
for (size_t i = 0; i < 4; i++) {
Float3 a = A_first[i];
Float3 b = B_first[i];
float dot = result[i];
std::cout << "(" << thrust::get<0>(a) << "," << thrust::get<1>(a) << ","
<< thrust::get<2>(a) << ")";
std::cout << " * ";
std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << ","
<< thrust::get<2>(b) << ")";
std::cout << " = ";
std::cout << dot << std::endl;
}
return 0;
}
✅ Max Abs Diff (3.5x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto a = parrot::array({1, 2, 3, 4});
auto b = parrot::array({2, 4, 3, 0});
(b - a).abs().maxr().print(); // 4
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/max_abs_diff.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <cuda/functional>
#include <iostream>
// this example computes the maximum absolute difference
// between the elements of two vectors
template <typename T> struct abs_diff {
__host__ __device__ T operator()(const T &a, const T &b) {
return fabsf(b - a);
}
};
int main() {
thrust::device_vector<float> d_a = {1.0, 2.0, 3.0, 4.0};
thrust::device_vector<float> d_b = {2.0, 4.0, 3.0, 0.0};
// initial value of the reduction
float init = 0;
// binary operations
cuda::maximum<float> binary_op1{};
abs_diff<float> binary_op2;
float max_abs_diff = thrust::inner_product(
d_a.begin(), d_a.end(), d_b.begin(), init, binary_op1, binary_op2);
std::cout << "maximum absolute difference: " << max_abs_diff << std::endl;
return 0;
}
✅ Min Max (10.0x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto data = parrot::scalar(89).add(10).repeat(10).rand().print();
data.minmax().print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/minmax.cu
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>
// compute minimum and maximum values in a single reduction
// minmax_pair stores the minimum and maximum
// values that have been encountered so far
template <typename T> struct minmax_pair {
T min_val;
T max_val;
};
// minmax_unary_op is a functor that takes in a value x and
// returns a minmax_pair whose minimum and maximum values
// are initialized to x.
template <typename T> struct minmax_unary_op {
__host__ __device__ minmax_pair<T> operator()(const T &x) const {
minmax_pair<T> result;
result.min_val = x;
result.max_val = x;
return result;
}
};
// minmax_binary_op is a functor that accepts two minmax_pair
// structs and returns a new minmax_pair whose minimum and
// maximum values are the min() and max() respectively of
// the minimums and maximums of the input pairs
template <typename T> struct minmax_binary_op {
__host__ __device__ minmax_pair<T> operator()(const minmax_pair<T> &x,
const minmax_pair<T> &y) const {
minmax_pair<T> result;
result.min_val = thrust::min(x.min_val, y.min_val);
result.max_val = thrust::max(x.max_val, y.max_val);
return result;
}
};
int main() {
// input size
size_t N = 10;
// initialize random number generator
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// initialize data on host
thrust::host_vector<int> host_data(N);
for (auto &e : host_data) {
e = dist(rng);
}
thrust::device_vector<int> data = host_data;
// setup arguments
minmax_unary_op<int> unary_op;
minmax_binary_op<int> binary_op;
// initialize reduction with the first value
minmax_pair<int> init = unary_op(data[0]);
// compute minimum and maximum values
minmax_pair<int> result = thrust::transform_reduce(data.begin(), data.end(),
unary_op, init, binary_op);
// print results
std::cout << "[ ";
for (auto &e : host_data) {
std::cout << e << " ";
}
std::cout << "]" << std::endl;
std::cout << "minimum = " << result.min_val << std::endl;
std::cout << "maximum = " << result.max_val << std::endl;
return 0;
}
✅ Mode (11.0x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto data = parrot::scalar(9).repeat(30).rand().print();
data.sort().print().rle().print().max_by_key(parrot::snd()).print();
// Output:
// 2 5 6 2 0 0 4 2 3 8 5 0 1 4 4 7 5 8 3 3 8 2 6 0 7 5 6 0 2 3
// 0 0 0 0 0 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 5 5 6 6 6 7 7 8 8 8
// (0, 5) (1, 1) (2, 5) (3, 4) (4, 3) (5, 4) (6, 3) (7, 2) (8, 3)
// (0, 5)
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/mode.cu
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <iostream>
#include <iterator>
// This example compute the mode [1] of a set of numbers. If there
// are multiple modes, one with the smallest value it returned.
//
// [1] http://en.wikipedia.org/wiki/Mode_(statistics)
int main() {
const size_t N = 30;
const size_t M = 10;
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, M - 1);
// generate random data on the host
thrust::host_vector<int> h_data(N);
for (auto &e : h_data) {
e = dist(rng);
}
// transfer data to device
thrust::device_vector<int> d_data(h_data);
// print the initial data
std::cout << "initial data" << std::endl;
thrust::copy(d_data.begin(), d_data.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// sort data to bring equal elements together
thrust::sort(d_data.begin(), d_data.end());
// print the sorted data
std::cout << "sorted data" << std::endl;
thrust::copy(d_data.begin(), d_data.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// count number of unique keys
size_t num_unique = thrust::unique_count(d_data.begin(), d_data.end());
// count multiplicity of each key
thrust::device_vector<int> d_output_keys(num_unique);
thrust::device_vector<int> d_output_counts(num_unique);
thrust::reduce_by_key(d_data.begin(), d_data.end(),
thrust::constant_iterator<int>(1),
d_output_keys.begin(), d_output_counts.begin());
// print the counts
std::cout << "values" << std::endl;
thrust::copy(d_output_keys.begin(), d_output_keys.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// print the counts
std::cout << "counts" << std::endl;
thrust::copy(d_output_counts.begin(), d_output_counts.end(),
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
// find the index of the maximum count
thrust::device_vector<int>::iterator mode_iter;
mode_iter =
thrust::max_element(d_output_counts.begin(), d_output_counts.end());
int mode =
d_output_keys[cuda::std::distance(d_output_counts.begin(), mode_iter)];
int occurrences = *mode_iter;
std::cout << "Modal value " << mode << " occurs " << occurrences << " times "
<< std::endl;
return 0;
}
✅ Monte Carlo (4.5x less code)#
Parrot Code
#include <iomanip>
#include "parrot.hpp"
int main() {
int N = 1000000;
auto x = parrot::scalar(1.0).repeat(N).rand();
auto y = parrot::scalar(1.0).repeat(N).rand();
auto i = x.sq().add(y.sq()).lte(1);
std::setprecision(3);
i.sum().div(N).times(4).print(); // 3.142
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/monte_carlo.cu
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>
#include <cmath>
#include <iomanip>
#include <iostream>
// we could vary M & N to find the perf sweet spot
__host__ __device__ unsigned int hash(unsigned int a) {
a = (a + 0x7ed55d16) + (a << 12);
a = (a ^ 0xc761c23c) ^ (a >> 19);
a = (a + 0x165667b1) + (a << 5);
a = (a + 0xd3a2646c) ^ (a << 9);
a = (a + 0xfd7046c5) + (a << 3);
a = (a ^ 0xb55a4f09) ^ (a >> 16);
return a;
}
struct estimate_pi {
__host__ __device__ float operator()(unsigned int thread_id) {
float sum = 0;
unsigned int N = 10000; // samples per thread
unsigned int seed = hash(thread_id);
// seed a random number generator
thrust::default_random_engine rng(seed);
// create a mapping from random numbers to [0,1)
thrust::uniform_real_distribution<float> u01(0, 1);
// take N samples in a quarter circle
for (unsigned int i = 0; i < N; ++i) {
// draw a sample from the unit square
float x = u01(rng);
float y = u01(rng);
// measure distance from the origin
float dist = sqrtf(x * x + y * y);
// add 1.0f if (u0,u1) is inside the quarter circle
if (dist <= 1.0f) {
sum += 1.0f;
}
}
// multiply by 4 to get the area of the whole circle
sum *= 4.0f;
// divide by N
return sum / N;
}
};
int main() {
// use 30K independent seeds
int M = 30000;
float estimate = thrust::transform_reduce(
thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(M),
estimate_pi(), 0.0f, cuda::std::plus<float>());
estimate /= M;
std::cout << std::setprecision(3);
std::cout << "pi is approximately " << estimate << std::endl;
return 0;
}
✅ Norm (3.8x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto x = parrot::array<float>({1.0, 2.0, 3.0, 4.0});
auto result = x.sq().sum().sqrt().print(); // 5.47723
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/norm.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>
#include <cmath>
#include <iostream>
// This example computes the norm [1] of a vector. The norm is
// computed by squaring all numbers in the vector, summing the
// squares, and taking the square root of the sum of squares. In
// Thrust this operation is efficiently implemented with the
// transform_reduce() algorithm. Specifically, we first transform
// x -> x^2 and the compute a standard plus reduction. Since there
// is no built-in functor for squaring numbers, we define our own
// square functor.
//
// [1] http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
// square<T> computes the square of a number f(x) -> x*x
template <typename T> struct square {
__host__ __device__ T operator()(const T &x) const { return x * x; }
};
int main() {
// initialize device vector directly
thrust::device_vector<float> d_x = {1.0, 2.0, 3.0, 4.0};
// setup arguments
square<float> unary_op;
cuda::std::plus<float> binary_op;
float init = 0;
// compute norm
float norm = std::sqrt(thrust::transform_reduce(d_x.begin(), d_x.end(),
unary_op, init, binary_op));
std::cout << "norm is " << norm << std::endl;
return 0;
}
✅ Padded Grid Reduction (10.6x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto shape = {10, 16};
auto grid = parrot::matrix(1.0, shape).rand();
auto mask = parrot::range(16).lt(11).cycle({10, 16});
grid.keep(mask).minmax().print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/padded_grid_reduction.cu
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>
#include <cmath>
#include <iomanip>
#include <float.h>
// This example computes the minimum and maximum values
// over a padded grid. The padded values are not considered
// during the reduction operation.
// transform a tuple (int,value) into a tuple (bool,value,value)
// where the bool is true for valid grid values and false for
// values in the padded region of the grid
template <typename IndexType, typename ValueType> struct transform_tuple {
using InputTuple = typename thrust::tuple<IndexType, ValueType>;
using OutputTuple = typename thrust::tuple<bool, ValueType, ValueType>;
IndexType n, N;
transform_tuple(IndexType n, IndexType N) : n(n), N(N) {}
__host__ __device__ OutputTuple operator()(const InputTuple &t) const {
bool is_valid = (thrust::get<0>(t) % N) < n;
return OutputTuple(is_valid, thrust::get<1>(t), thrust::get<1>(t));
}
};
// reduce two tuples (bool,value,value) into a single tuple such that output
// contains the smallest and largest *valid* values.
template <typename IndexType, typename ValueType> struct reduce_tuple {
using Tuple = typename thrust::tuple<bool, ValueType, ValueType>;
__host__ __device__ Tuple operator()(const Tuple &t0, const Tuple &t1) const {
if (thrust::get<0>(t0) && thrust::get<0>(t1)) // both valid
{
return Tuple(true, thrust::min(thrust::get<1>(t0), thrust::get<1>(t1)),
thrust::max(thrust::get<2>(t0), thrust::get<2>(t1)));
} else if (thrust::get<0>(t0)) {
return t0;
} else if (thrust::get<0>(t1)) {
return t1;
} else {
return t1; // if neither is valid then it doesn't matter what we return
}
}
};
int main() {
int M = 10; // number of rows
int n = 11; // number of columns excluding padding
int N = 16; // number of columns including padding
thrust::default_random_engine rng(12345);
thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
thrust::device_vector<float> data(M * N, -1);
// initialize valid values in grid
for (int i = 0; i < M; i++) {
for (int j = 0; j < n; j++) {
data[i * N + j] = dist(rng);
}
}
// print full grid
std::cout << "padded grid" << std::endl;
std::cout << std::fixed << std::setprecision(4);
for (int i = 0; i < M; i++) {
std::cout << " ";
for (int j = 0; j < N; j++) {
std::cout << data[i * N + j] << " ";
}
std::cout << "\n";
}
std::cout << "\n";
// compute min & max over valid region of the 2d grid
using result_type = thrust::tuple<bool, float, float>;
result_type init(true, FLT_MAX, -FLT_MAX); // initial value
transform_tuple<int, float> unary_op(n, N); // transformation operator
reduce_tuple<int, float> binary_op; // reduction operator
result_type result = thrust::transform_reduce(
thrust::make_zip_iterator(thrust::counting_iterator<int>(0),
data.begin()),
thrust::make_zip_iterator(
thrust::make_tuple(thrust::counting_iterator<int>(0), data.begin())) +
data.size(),
unary_op, init, binary_op);
std::cout << "minimum value: " << thrust::get<1>(result) << std::endl;
std::cout << "maximum value: " << thrust::get<2>(result) << std::endl;
return 0;
}
✅ Permutation Iterator (2.2x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto source = parrot::array({10, 20, 30, 40, 50, 60});
auto indices = parrot::array({3, 1, 0, 5});
source.gather(indices).sum().print(); // 130
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/permutation_iterator.cu
#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <iostream>
// this example fuses a gather operation with a reduction for
// greater efficiency than separate gather() and reduce() calls
int main() {
// gather locations
thrust::device_vector<int> map = {3, 1, 0, 5};
// array to gather from
thrust::device_vector<int> source = {10, 20, 30, 40, 50, 60};
// fuse gather with reduction:
// sum = source[map[0]] + source[map[1]] + ...
int sum = thrust::reduce(
thrust::make_permutation_iterator(source.begin(), map.begin()),
thrust::make_permutation_iterator(source.begin(), map.end()));
// print sum
std::cout << "sum is " << sum << std::endl;
return 0;
}
✅ Remove Points2d (5.0x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto is_in_circle = [] __host__ __device__(float x, float y) {
return x * x + y * y <= 1;
};
auto x = parrot::scalar(1.0f).repeat(20).rand();
auto y = parrot::scalar(1.0f).repeat(20).rand();
auto p = x.pairs(y).filter(thrust::make_zip_function(is_in_circle)).print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/remove_points2d.cu
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/remove.h>
// This example generates random points in the
// unit square [0,1)x[0,1) and then removes all
// points where x^2 + y^2 > 1
//
// The x and y coordinates are stored in separate arrays
// and a zip_iterator is used to combine them together
template <typename T> struct is_outside_circle {
template <typename Tuple>
inline __host__ __device__ bool operator()(const Tuple &tuple) const {
// unpack the tuple into x and y coordinates
const T x = thrust::get<0>(tuple);
const T y = thrust::get<1>(tuple);
if (x * x + y * y > 1) {
return true;
} else {
return false;
}
}
};
int main() {
const size_t N = 20;
// generate random points in the unit square on the host
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> x(N);
thrust::host_vector<float> y(N);
for (size_t i = 0; i < N; i++) {
x[i] = u01(rng);
y[i] = u01(rng);
}
// print the initial points
std::cout << std::fixed;
std::cout << "Generated " << N << " points" << std::endl;
for (size_t i = 0; i < x.size(); i++) {
std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
}
std::cout << std::endl;
// remove points where x^2 + y^2 > 1 and determine new array sizes
size_t new_size =
thrust::remove_if(thrust::make_zip_iterator(x.begin(), y.begin()),
thrust::make_zip_iterator(x.end(), y.end()),
is_outside_circle<float>()) -
thrust::make_zip_iterator(x.begin(), y.begin());
// resize the vectors (note: this does not free any memory)
x.resize(new_size);
y.resize(new_size);
// print the filtered points
std::cout << "After stream compaction, " << new_size << " points remain"
<< std::endl;
for (size_t i = 0; i < x.size(); i++) {
std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
}
return 0;
}
✅ Run Length Encoding (3.7x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
const char data[] = "aaabbbbbcddeeeeeeeeeff";
const size_t N = (sizeof(data) / sizeof(char)) - 1;
auto input = std::vector(data, data + N);
parrot::array(input)
.rle()
.print(); //(a, 3) (b, 5) (c, 1) (d, 2) (e, 9) (f, 2)
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/run_length_encoding.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <iostream>
#include <iterator>
// This example computes a run-length code [1] for an array of characters.
//
// [1] http://en.wikipedia.org/wiki/Run-length_encoding
int main() {
// input data on the host
const char data[] = "aaabbbbbcddeeeeeeeeeff";
const size_t N = (sizeof(data) / sizeof(char)) - 1;
// copy input data to the device
thrust::device_vector<char> input(data, data + N);
// allocate storage for output data and run lengths
thrust::device_vector<char> output(N);
thrust::device_vector<int> lengths(N);
// print the initial data
std::cout << "input data:" << std::endl;
thrust::copy(input.begin(), input.end(),
std::ostream_iterator<char>(std::cout, ""));
std::cout << std::endl << std::endl;
// compute run lengths
size_t num_runs =
thrust::reduce_by_key(
input.begin(),
input.end(), // input key sequence
thrust::constant_iterator<int>(1), // input value sequence
output.begin(), // output key sequence
lengths.begin() // output value sequence
)
.first -
output.begin(); // compute the output size
// print the output
std::cout << "run-length encoded output:" << std::endl;
for (size_t i = 0; i < num_runs; i++) {
std::cout << "(" << output[i] << "," << lengths[i] << ")";
}
std::cout << std::endl;
return 0;
}
✅ Saxpy (7.0x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto x = parrot::array({1.0, 1.0, 1.0, 1.0});
auto y = parrot::array({1.0, 2.0, 3.0, 4.0});
x.times(2.0).add(y).print(); // 3 4 5 6
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/saxpy.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>
#include <algorithm>
#include <iostream>
#include <iterator>
// This example illustrates how to implement the SAXPY
// operation (Y[i] = a * X[i] + Y[i]) using Thrust.
// The saxpy_slow function demonstrates the most
// straightforward implementation using a temporary
// array and two separate transformations, one with
// multiplies and one with plus. The saxpy_fast function
// implements the operation with a single transformation
// and represents "best practice".
struct saxpy_functor {
const float a;
saxpy_functor(float _a) : a(_a) {}
__host__ __device__ float operator()(const float &x, const float &y) const {
return a * x + y;
}
};
void saxpy_fast(float A, thrust::device_vector<float> &X,
thrust::device_vector<float> &Y) {
// Y <- A * X + Y
thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}
void saxpy_slow(float A, thrust::device_vector<float> &X,
thrust::device_vector<float> &Y) {
thrust::device_vector<float> temp(X.size());
// temp <- A
thrust::fill(temp.begin(), temp.end(), A);
// temp <- A * X
thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(),
cuda::std::multiplies<float>());
// Y <- A * X + Y
thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(),
cuda::std::plus<float>());
}
int main() {
// initialize host arrays
thrust::host_vector<float> x{1.0, 1.0, 1.0, 1.0};
thrust::host_vector<float> y{1.0, 2.0, 3.0, 4.0};
{
// transfer to device
thrust::device_vector<float> X(x);
thrust::device_vector<float> Y(y);
// slow method
saxpy_slow(2.0, X, Y);
}
{
// transfer to device
thrust::device_vector<float> X(x);
thrust::device_vector<float> Y(y);
// fast method
saxpy_fast(2.0, X, Y);
}
return 0;
}
✅ Simple Moving Average (8.8x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto data = parrot::scalar(10).repeat(30).rand().print();
auto sums = data.prepend(0).sums();
sums.drop(4).minus(sums.take(27)).div(4.0).reshape({3, 9}).print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/simple_moving_average.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iomanip>
#include <iostream>
// Efficiently computes the simple moving average (SMA) [1] of a data series
// using a parallel prefix-sum or "scan" operation.
//
// Note: additional numerical precision should be used in the cumulative summing
// stage when computing the SMA of large data series. The most straightforward
// remedy is to replace 'float' with 'double'. Alternatively a Kahan or
// "compensated" summation algorithm could be applied [2].
//
// [1] http://en.wikipedia.org/wiki/Moving_average#Simple_moving_average
// [2] http://en.wikipedia.org/wiki/Kahan_summation_algorithm
// compute the difference of two positions in the cumumulative sum and
// divide by the SMA window size w.
template <typename T> struct minus_and_divide {
T w;
minus_and_divide(T w) : w(w) {}
__host__ __device__ T operator()(const T &a, const T &b) const {
return (a - b) / w;
}
};
template <typename InputVector, typename OutputVector>
void simple_moving_average(const InputVector &data, size_t w,
OutputVector &output) {
using T = typename InputVector::value_type;
if (data.size() < w) {
return;
}
// allocate storage for cumulative sum
thrust::device_vector<T> temp(data.size() + 1);
// compute cumulative sum
thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
temp[data.size()] = data.back() + temp[data.size() - 1];
// compute moving averages from cumulative sum
thrust::transform(temp.begin() + w, temp.end(), temp.begin(), output.begin(),
minus_and_divide<T>(T(w)));
}
int main() {
// length of data series
size_t n = 30;
// window size of the moving average
size_t w = 4;
// generate random data series
thrust::host_vector<float> host_data(n);
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, 10);
for (auto &e : host_data) {
e = static_cast<float>(dist(rng));
}
thrust::device_vector<float> data = host_data;
// allocate storage for averages
thrust::device_vector<float> averages(data.size() - (w - 1));
// compute SMA using standard summation
simple_moving_average(data, w, averages);
// print data series
std::cout << "data series: [ ";
for (const auto &value : data) {
std::cout << value << " ";
}
std::cout << "]" << std::endl;
// print moving averages
std::cout << "simple moving averages (window = " << w << ")" << std::endl;
for (size_t i = 0; i < averages.size(); i++) {
std::cout << " [" << std::setw(2) << i << "," << std::setw(2) << (i + w)
<< ") = " << averages[i] << std::endl;
}
return 0;
}
✅ Sort (12.3x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto const N = 10;
auto ints = parrot::scalar(100).repeat(N).rand();
auto floats = parrot::scalar(10.0).repeat(N).rand();
auto even_first = [] __host__ __device__(int x) { return x % 2; };
ints.sort().print(); // sort integers
ints.sort_by(thrust::greater<int>()).print(); // sort integers descending
ints.sort_by_key(even_first).print(); // sort integers (user-defined)
floats.sort().print(); // sort floats
floats.pairs(ints).sort().print(); // sort pairs
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sort.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cuda/std/utility>
#include <iomanip>
#include <iostream>
#include <numeric>
// Helper routines
void initialize(thrust::device_vector<int> &v) {
thrust::default_random_engine rng(123456);
thrust::uniform_int_distribution<int> dist(10, 99);
thrust::host_vector<int> host_data(v.size());
for (auto &e : host_data) {
e = dist(rng);
}
v = host_data;
}
void initialize(thrust::device_vector<float> &v) {
thrust::default_random_engine rng(123456);
thrust::uniform_int_distribution<int> dist(2, 19);
thrust::host_vector<float> host_data(v.size());
for (auto &e : host_data) {
e = dist(rng) / 2.0f;
}
v = host_data;
}
void initialize(thrust::device_vector<cuda::std::pair<int, int>> &v) {
thrust::default_random_engine rng(123456);
thrust::uniform_int_distribution<int> dist(0, 9);
thrust::host_vector<cuda::std::pair<int, int>> host_data(v.size());
for (auto &e : host_data) {
int a = dist(rng);
int b = dist(rng);
e = cuda::std::make_pair(a, b);
}
v = host_data;
}
void initialize(thrust::device_vector<int> &v1,
thrust::device_vector<int> &v2) {
thrust::default_random_engine rng(123456);
thrust::uniform_int_distribution<int> dist(10, 99);
thrust::host_vector<int> host_data(v1.size());
for (auto &e : host_data) {
e = dist(rng);
}
v1 = host_data;
thrust::sequence(v2.begin(), v2.end(), 0);
}
void print(const thrust::device_vector<int> &v) {
for (const auto &value : v) {
std::cout << " " << value;
}
std::cout << "\n";
}
void print(const thrust::device_vector<float> &v) {
for (const auto &value : v) {
std::cout << " " << std::fixed << std::setprecision(1) << value;
}
std::cout << "\n";
}
void print(const thrust::device_vector<cuda::std::pair<int, int>> &v) {
for (const auto &p : v) {
cuda::std::pair<int, int> local_p = p;
std::cout << " (" << local_p.first << "," << local_p.second << ")";
}
std::cout << "\n";
}
void print(thrust::device_vector<int> &v1, thrust::device_vector<int> v2) {
for (size_t i = 0; i < v1.size(); i++) {
std::cout << " (" << v1[i] << "," << std::setw(2) << v2[i] << ")";
}
std::cout << "\n";
}
// user-defined comparison operator that acts like less<int>,
// except even numbers are considered to be smaller than odd numbers
struct evens_before_odds {
__host__ __device__ bool operator()(int x, int y) {
if (x % 2 == y % 2) {
return x < y;
} else if (x % 2) {
return false;
} else {
return true;
}
}
};
int main() {
size_t N = 16;
std::cout << "sorting integers\n";
{
thrust::device_vector<int> keys(N);
initialize(keys);
print(keys);
thrust::sort(keys.begin(), keys.end());
print(keys);
}
std::cout << "\nsorting integers (descending)\n";
{
thrust::device_vector<int> keys(N);
initialize(keys);
print(keys);
thrust::sort(keys.begin(), keys.end(), cuda::std::greater<int>());
print(keys);
}
std::cout << "\nsorting integers (user-defined comparison)\n";
{
thrust::device_vector<int> keys(N);
initialize(keys);
print(keys);
thrust::sort(keys.begin(), keys.end(), evens_before_odds());
print(keys);
}
std::cout << "\nsorting floats\n";
{
thrust::device_vector<float> keys(N);
initialize(keys);
print(keys);
thrust::sort(keys.begin(), keys.end());
print(keys);
}
std::cout << "\nsorting pairs\n";
{
thrust::device_vector<cuda::std::pair<int, int>> keys(N);
initialize(keys);
print(keys);
thrust::sort(keys.begin(), keys.end());
print(keys);
}
std::cout << "\nkey-value sorting\n";
{
thrust::device_vector<int> keys(N);
thrust::device_vector<int> values(N);
initialize(keys, values);
print(keys, values);
thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
print(keys, values);
}
std::cout << "\nkey-value sorting (descending)\n";
{
thrust::device_vector<int> keys(N);
thrust::device_vector<int> values(N);
initialize(keys, values);
print(keys, values);
thrust::sort_by_key(keys.begin(), keys.end(), values.begin(),
cuda::std::greater<int>());
print(keys, values);
}
return 0;
}
✅ Sum (5.2x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
parrot::scalar(9999).repeat(100).sum().print(); //
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sum.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
int my_rand() {
static thrust::default_random_engine rng;
static thrust::uniform_int_distribution<int> dist(0, 9999);
return dist(rng);
}
int main() {
// generate random data on the host
thrust::host_vector<int> h_vec(100);
thrust::generate(h_vec.begin(), h_vec.end(), my_rand);
// transfer to device and compute sum
thrust::device_vector<int> d_vec = h_vec;
// initial value of the reduction
int init = 0;
// binary operation used to reduce values
cuda::std::plus<int> binary_op;
// compute sum on the device
int sum = thrust::reduce(d_vec.begin(), d_vec.end(), init, binary_op);
// print the sum
std::cout << "sum is " << sum << std::endl;
return 0;
}
✅ Sum Rows (8.2x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto matrix = parrot::matrix(89, {5, 8}).add(10).rand();
matrix.sum<2>().print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sum_rows.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <iostream>
// convert a linear index to a row index
template <typename T> struct linear_index_to_row_index {
T C; // number of columns
__host__ __device__ linear_index_to_row_index(T C) : C(C) {}
__host__ __device__ T operator()(T i) { return i / C; }
};
int main() {
int R = 5; // number of rows
int C = 8; // number of columns
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(10, 99);
// initialize data
thrust::host_vector<int> host_array(R * C);
for (auto &e : host_array) {
e = dist(rng);
}
thrust::device_vector<int> array = host_array;
// allocate storage for row sums and indices
thrust::device_vector<int> row_sums(R);
thrust::device_vector<int> row_indices(R);
// compute row sums by summing values with equal row indices
thrust::reduce_by_key(
thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
linear_index_to_row_index<int>(C)),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0),
linear_index_to_row_index<int>(C)) +
(R * C),
array.begin(), row_indices.begin(), row_sums.begin(),
cuda::std::equal_to<int>(), cuda::std::plus<int>());
// print data
for (int i = 0; i < R; i++) {
std::cout << "[ ";
for (int j = 0; j < C; j++) {
std::cout << array[i * C + j] << " ";
}
std::cout << "] = " << row_sums[i] << "\n";
}
return 0;
}
✅ Summed Area Table (14.4x less code)#
Parrot Code
#include "parrot.hpp"
int main() {
auto m = parrot::matrix(1, {3, 4});
m.sums<2>().sums<1>().print();
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/summed_area_table.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>
#include <iomanip>
#include <iostream>
// This example computes a summed area table using segmented scan
// http://en.wikipedia.org/wiki/Summed_area_table
// convert a linear index to a linear index in the transpose
struct transpose_index {
size_t m, n;
__host__ __device__ transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
__host__ __device__ size_t operator()(size_t linear_index) {
size_t i = linear_index / n;
size_t j = linear_index % n;
return m * j + i;
}
};
// convert a linear index to a row index
struct row_index {
size_t n;
__host__ __device__ row_index(size_t _n) : n(_n) {}
__host__ __device__ size_t operator()(size_t i) { return i / n; }
};
// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T> &src,
thrust::device_vector<T> &dst) {
thrust::counting_iterator<size_t> indices(0);
thrust::gather(
thrust::make_transform_iterator(indices, transpose_index(n, m)),
thrust::make_transform_iterator(indices, transpose_index(n, m)) +
dst.size(),
src.begin(), dst.begin());
}
// scan the rows of an M-by-N array
template <typename T>
void scan_horizontally(size_t n, thrust::device_vector<T> &d_data) {
thrust::counting_iterator<size_t> indices(0);
thrust::inclusive_scan_by_key(
thrust::make_transform_iterator(indices, row_index(n)),
thrust::make_transform_iterator(indices, row_index(n)) + d_data.size(),
d_data.begin(), d_data.begin());
}
// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T> &d_data) {
thrust::host_vector<T> h_data = d_data;
for (size_t i = 0; i < m; i++) {
for (size_t j = 0; j < n; j++) {
std::cout << std::setw(8) << h_data[i * n + j] << " ";
}
std::cout << "\n";
}
}
int main() {
size_t m = 3; // number of rows
size_t n = 4; // number of columns
// 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
thrust::device_vector<int> data(m * n, 1);
std::cout << "[step 0] initial array" << std::endl;
print(m, n, data);
std::cout << "[step 1] scan horizontally" << std::endl;
scan_horizontally(n, data);
print(m, n, data);
std::cout << "[step 2] transpose array" << std::endl;
thrust::device_vector<int> temp(m * n);
transpose(m, n, data, temp);
print(n, m, temp);
std::cout << "[step 3] scan transpose horizontally" << std::endl;
scan_horizontally(m, temp);
print(n, m, temp);
std::cout << "[step 4] transpose the transpose" << std::endl;
transpose(n, m, temp, data);
print(m, n, data);
return 0;
}
✅ Transform Iterator (8.9x less code)#
Parrot Code
#include <parrot.hpp>
int main() {
auto values = parrot::array({2, 5, 7, 1, 6, 0, 3, 8});
auto clamp = values.max(1).min(5).print(); // 2 5 5 1 5 1 3 5
auto sum = clamp.sum().print(); // 27
auto iota = parrot::range(10).minus(1).print(); // 0 1 2 3 4 5 6 7 8 9
auto clamp_i = iota.max(1).min(5).print(); // 1 2 3 4 5 5 5 5 5 5
auto negated = clamp_i.neg().print(); // -1 -2 -3 -4 -5 -5 -5 -5 -5 -5
}
Thrust Code
Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/transform_iterator.cu
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>
#include <iostream>
#include <iterator>
#include <string>
// this functor clamps a value to the range [lo, hi]
template <typename T> struct clamp {
T lo, hi;
__host__ __device__ clamp(T _lo, T _hi) : lo(_lo), hi(_hi) {}
__host__ __device__ T operator()(T x) {
if (x < lo) {
return lo;
} else if (x < hi) {
return x;
} else {
return hi;
}
}
};
template <typename T> struct simple_negate {
__host__ __device__ T operator()(T x) { return -x; }
};
template <typename Iterator>
void print_range(const std::string &name, Iterator first, Iterator last) {
using T = typename std::iterator_traits<Iterator>::value_type;
std::cout << name << ": ";
thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));
std::cout << "\n";
}
int main() {
// clamp values to the range [1, 5]
int lo = 1;
int hi = 5;
// define some types
using Vector = thrust::device_vector<int>;
using VectorIterator = Vector::iterator;
// initialize values
Vector values(8);
values[0] = 2;
values[1] = 5;
values[2] = 7;
values[3] = 1;
values[4] = 6;
values[5] = 0;
values[6] = 3;
values[7] = 8;
print_range("values ", values.begin(), values.end());
// define some more types
using ClampedVectorIterator =
thrust::transform_iterator<clamp<int>, VectorIterator>;
// create a transform_iterator that applies clamp() to the values array
ClampedVectorIterator cv_begin =
thrust::make_transform_iterator(values.begin(), clamp<int>(lo, hi));
ClampedVectorIterator cv_end = cv_begin + values.size();
// now [clamped_begin, clamped_end) defines a sequence of clamped values
print_range("clamped values ", cv_begin, cv_end);
////
// compute the sum of the clamped sequence with reduce()
std::cout << "sum of clamped values : " << thrust::reduce(cv_begin, cv_end)
<< "\n";
////
// combine transform_iterator with other fancy iterators like
// counting_iterator
using CountingIterator = thrust::counting_iterator<int>;
using ClampedCountingIterator =
thrust::transform_iterator<clamp<int>, CountingIterator>;
CountingIterator count_begin(0);
CountingIterator count_end(10);
print_range("sequence ", count_begin, count_end);
ClampedCountingIterator cs_begin =
thrust::make_transform_iterator(count_begin, clamp<int>(lo, hi));
ClampedCountingIterator cs_end =
thrust::make_transform_iterator(count_end, clamp<int>(lo, hi));
print_range("clamped sequence ", cs_begin, cs_end);
////
// combine transform_iterator with another transform_iterator
using NegatedClampedCountingIterator =
thrust::transform_iterator<cuda::std::negate<int>,
ClampedCountingIterator>;
NegatedClampedCountingIterator ncs_begin =
thrust::make_transform_iterator(cs_begin, cuda::std::negate<int>());
NegatedClampedCountingIterator ncs_end =
thrust::make_transform_iterator(cs_end, cuda::std::negate<int>());
print_range("negated sequence ", ncs_begin, ncs_end);
////
// when a functor does not define result_type, a third template argument must
// be provided
using NegatedVectorIterator =
thrust::transform_iterator<simple_negate<int>, VectorIterator, int>;
NegatedVectorIterator nv_begin(values.begin(), simple_negate<int>());
NegatedVectorIterator nv_end(values.end(), simple_negate<int>());
print_range("negated values ", nv_begin, nv_end);
return 0;
}