Parrot vs Thrust#

This page provides comparisons between Parrot code and equivalent Thrust code for common operations. Parrot is built on top of Thrust but provides a more concise and expressive API.

Examples #

✅ Arbitrary Transformation (6.1x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto A = parrot::array({3, 4, 0, 8, 2});
    auto B = parrot::array({6, 7, 2, 1, 8});
    auto C = parrot::array({2, 5, 7, 4, 3});
    (B * C + A).print();  // 15 39 14 12 26
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/arbitrary_transformation.cu

#include <thrust/detail/config.h>

#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/zip_function.h>

#include <iostream>

// This example shows how to implement an arbitrary transformation of
// the form output[i] = F(first[i], second[i], third[i], ... ).
// In this example, we use a function with 3 inputs and 1 output.
//
// Iterators for all four vectors (3 inputs + 1 output) are "zipped"
// into a single sequence of tuples with the zip_iterator.
//
// The arbitrary_functor receives a tuple that contains four elements,
// which are references to values in each of the four sequences. When we
// access the tuple 't' with the get() function,
//      get<0>(t) returns a reference to A[i],
//      get<1>(t) returns a reference to B[i],
//      get<2>(t) returns a reference to C[i],
//      get<3>(t) returns a reference to D[i].
//
// In this example, we can implement the transformation,
//      D[i] = A[i] + B[i] * C[i];
// by invoking arbitrary_functor() on each of the tuples using for_each.
//
// If we are using a functor that is not designed for zip iterators by taking a
// tuple instead of individual arguments we can adapt this function using the
// zip_function adaptor (C++11 only).
//
// Note that we could extend this example to implement functions with an
// arbitrary number of input arguments by zipping more sequence together.
// With the same approach we can have multiple *output* sequences, if we
// wanted to implement something like
//      D[i] = A[i] + B[i] * C[i];
//      E[i] = A[i] + B[i] + C[i];
//
// The possibilities are endless! :)

struct arbitrary_functor1 {
  template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
    // D[i] = A[i] + B[i] * C[i];
    thrust::get<3>(t) =
        thrust::get<0>(t) + thrust::get<1>(t) * thrust::get<2>(t);
  }
};

struct arbitrary_functor2 {
  __host__ __device__ void operator()(const float &a, const float &b,
                                      const float &c, float &d) {
    // D[i] = A[i] + B[i] * C[i];
    d = a + b * c;
  }
};

int main() {
  // allocate and initialize
  thrust::device_vector<float> A{3, 4, 0, 8, 2};
  thrust::device_vector<float> B{6, 7, 2, 1, 8};
  thrust::device_vector<float> C{2, 5, 7, 4, 3};
  thrust::device_vector<float> D1(5);

  // apply the transformation
  thrust::for_each(
      thrust::make_zip_iterator(A.begin(), B.begin(), C.begin(), D1.begin()),
      thrust::make_zip_iterator(A.end(), B.end(), C.end(), D1.end()),
      arbitrary_functor1());

  // print the output
  std::cout << "Tuple functor" << std::endl;
  for (size_t i = 0; i < A.size(); i++) {
    std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i]
              << std::endl;
  }

  // apply the transformation using zip_function
  thrust::device_vector<float> D2(5);
  thrust::for_each(
      thrust::make_zip_iterator(A.begin(), B.begin(), C.begin(), D2.begin()),
      thrust::make_zip_iterator(A.end(), B.end(), C.end(), D2.end()),
      thrust::make_zip_function(arbitrary_functor2()));

  // print the output
  std::cout << "N-ary functor" << std::endl;
  for (size_t i = 0; i < A.size(); i++) {
    std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i]
              << std::endl;
  }
}

✅ Basic Vector (2.7x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    parrot::array({14, 20, 38, 46})
      .print()  // 14 20 38 46
      .take(2)
      .print();  // 14 20
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/basic_vector.cu

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <iostream>

int main() {
  // H holds 4 integers
  thrust::host_vector<int> H{14, 20, 38, 46};

  // H.size() returns the size of vector H
  std::cout << "H has size " << H.size() << std::endl;

  // print contents of H
  for (size_t i = 0; i < H.size(); i++) {
    std::cout << "H[" << i << "] = " << H[i] << std::endl;
  }

  // resize H
  H.resize(2);

  std::cout << "H now has size " << H.size() << std::endl;

  // Copy host_vector H to device_vector D
  thrust::device_vector<int> D = H;

  // elements of D can be modified
  D[0] = 99;
  D[1] = 88;

  // print contents of D
  for (size_t i = 0; i < D.size(); i++) {
    std::cout << "D[" << i << "] = " << D[i] << std::endl;
  }

  // H and D are automatically deleted when the function returns
  return 0;
}

✅ Bounding Box (7.3x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto ll = parrot::matrix(1.0, {2, 10}).rand();
    auto ur = parrot::matrix(1.0, {2, 10}).rand().add(1);
    ll.minr<2>().print();
    ur.maxr<2>().print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/bounding_box.cu

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

#include <cuda/std/utility>

// This example shows how to compute a bounding box
// for a set of points in two dimensions.

struct point2d {
  float x, y;

  __host__ __device__ point2d() : x(0), y(0) {}

  __host__ __device__ point2d(float _x, float _y) : x(_x), y(_y) {}
};

// bounding box type
struct bbox {
  // construct an empty box
  __host__ __device__ bbox() {}

  // construct a box from a single point
  __host__ __device__ bbox(const point2d &point)
      : lower_left(point), upper_right(point) {}

  // construct a box from a single point
  __host__ __device__ bbox &operator=(const point2d &point) {
    lower_left = point;
    upper_right = point;
    return *this;
  }

  // construct a box from a pair of points
  __host__ __device__ bbox(const point2d &ll, const point2d &ur)
      : lower_left(ll), upper_right(ur) {}

  point2d lower_left, upper_right;
};

// reduce a pair of bounding boxes (a,b) to a bounding box containing a and b
struct bbox_union {
  __host__ __device__ bbox operator()(bbox a, bbox b) {
    // lower left corner
    point2d ll(thrust::min(a.lower_left.x, b.lower_left.x),
               thrust::min(a.lower_left.y, b.lower_left.y));

    // upper right corner
    point2d ur(thrust::max(a.upper_right.x, b.upper_right.x),
               thrust::max(a.upper_right.y, b.upper_right.y));

    return bbox(ll, ur);
  }
};

int main() {
  const size_t N = 40;

  // allocate storage for points
  thrust::device_vector<point2d> points(N);

  // generate some random points in the unit square
  thrust::default_random_engine rng;
  thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
  for (size_t i = 0; i < N; i++) {
    float x = u01(rng);
    float y = u01(rng);
    points[i] = point2d(x, y);
  }

  // initial bounding box contains first point
  bbox init(points[0], points[0]);

  // compute the bounding box for the point set
  bbox result =
      thrust::reduce(points.begin(), points.end(), init, bbox_union{});

  // print output
  std::cout << "bounding box " << std::fixed;
  std::cout << "(" << result.lower_left.x << "," << result.lower_left.y << ") ";
  std::cout << "(" << result.upper_right.x << "," << result.upper_right.y << ")"
            << std::endl;

  return 0;
}

✅ Constant Iterator (2.3x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto data     = parrot::array({3, 7, 2, 5});
    auto constant = parrot::scalar(10).repeat(4);
    data.add(constant).print();  // [13, 17, 12, 15] - first method
    data.add(10).print();        // [13, 17, 12, 15] - better method
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/constant_iterator.cu

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform.h>

#include <iostream>
#include <iterator>

int main() {
  thrust::device_vector<int> data{3, 7, 2, 5};

  // add 10 to all values in data
  thrust::transform(data.begin(), data.end(),
                    thrust::constant_iterator<int>(10), data.begin(),
                    cuda::std::plus<int>());

  // data is now [13, 17, 12, 15]

  // print result
  thrust::copy(data.begin(), data.end(),
               std::ostream_iterator<int>(std::cout, "\n"));

  return 0;
}

✅ Counting Iterator (2.5x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto stencil = parrot::array({0, 1, 1, 0, 0, 1, 0, 1});
    auto indices = parrot::range(8)
                     .minus(1)
                     .keep(stencil)
                     .print();  // [1, 2, 5, 7]
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/counting_iterator.cu

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>

#include <iostream>
#include <iterator>

int main() {
  // this example computes indices for all the nonzero values in a sequence

  // sequence of zero and nonzero values
  thrust::device_vector<int> stencil{0, 1, 1, 0, 0, 1, 0, 1};

  // storage for the nonzero indices
  thrust::device_vector<int> indices(8);

  // counting iterators define a sequence [0, 8)
  thrust::counting_iterator<int> first(0);
  thrust::counting_iterator<int> last = first + 8;

  // compute indices of nonzero elements
  using IndexIterator = thrust::device_vector<int>::iterator;

  IndexIterator indices_end = thrust::copy_if(
      first, last, stencil.begin(), indices.begin(), cuda::std::identity{});
  // indices now contains [1,2,5,7]

  // print result
  std::cout << "found " << cuda::std::distance(indices.begin(), indices_end)
            << " nonzero values at indices:\n";
  thrust::copy(indices.begin(), indices_end,
               std::ostream_iterator<int>(std::cout, "\n"));

  return 0;
}

✅ Dot Products With Zip (10.1x less code)#

Parrot Code

#include <parrot.hpp>

int main() {
    int N  = 1000;
    auto a = parrot::matrix(1.0, {3, N}).rand();
    auto b = parrot::matrix(1.0, {3, N}).rand();
    a.times(b).sum<2>().print();  // 256.714 236.45 256.613 (roughly)
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/dot_products_with_zip.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/transform.h>

// This example shows how thrust::zip_iterator can be used to create a
// 'virtual' array of structures.  In this case the structure is a 3d
// vector type (Float3) whose (x,y,z) components will be stored in
// three separate float arrays.  The zip_iterator "zips" these arrays
// into a single virtual Float3 array.

// We'll use a 3-tuple to store our 3d vector type
using Float3 = thrust::tuple<float, float, float>;

// This functor implements the dot product between 3d vectors
struct DotProduct {
  __host__ __device__ float operator()(const Float3 &a, const Float3 &b) const {
    return thrust::get<0>(a) * thrust::get<0>(b) + // x components
           thrust::get<1>(a) * thrust::get<1>(b) + // y components
           thrust::get<2>(a) * thrust::get<2>(b);  // z components
  }
};

// Return a host vector with random values in the range [0,1)
thrust::host_vector<float>
random_vector(const size_t N,
              unsigned int seed = thrust::default_random_engine::default_seed) {
  thrust::default_random_engine rng(seed);
  thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
  thrust::host_vector<float> temp(N);
  for (size_t i = 0; i < N; ++i) {
    temp[i] = u01(rng);
  }
  return temp;
}

int main() {
  // number of vectors
  const size_t N = 1000;

  // We'll store the components of the 3d vectors in separate arrays. One set of
  // arrays will store the 'A' vectors and another set will store the 'B'
  // vectors.

  // This 'structure of arrays' (SoA) approach is usually more efficient than
  // the 'array of structures' (AoS) approach.  The primary reason is that
  // structures, like Float3, don't always obey the memory coalescing rules, so
  // they are not efficiently transferred to and from memory.  Another reason to
  // prefer SoA to AoS is that we don't always want to process all members of
  // the structure.  For example, if we only need to look at first element of
  // the structure then it is wasteful to load the entire structure from memory.
  // With the SoA approach, we can chose which elements of the structure we wish
  // to read.

  thrust::device_vector<float> A0 =
      random_vector(N); // x components of the 'A' vectors
  thrust::device_vector<float> A1 =
      random_vector(N); // y components of the 'A' vectors
  thrust::device_vector<float> A2 =
      random_vector(N); // z components of the 'A' vectors

  thrust::device_vector<float> B0 =
      random_vector(N); // x components of the 'B' vectors
  thrust::device_vector<float> B1 =
      random_vector(N); // y components of the 'B' vectors
  thrust::device_vector<float> B2 =
      random_vector(N); // z components of the 'B' vectors

  // Storage for result of each dot product
  thrust::device_vector<float> result(N);

  // We'll now illustrate two ways to use zip_iterator to compute the dot
  // products.  The first method is verbose but shows how the parts fit
  // together. The second method hides these details and is more concise.

  // METHOD #1
  // Defining a zip_iterator type can be a little cumbersome ...
  using FloatIterator = thrust::device_vector<float>::iterator;
  using FloatIteratorTuple =
      thrust::tuple<FloatIterator, FloatIterator, FloatIterator>;
  using Float3Iterator = thrust::zip_iterator<FloatIteratorTuple>;

  // Now we'll create some zip_iterators for A and B
  Float3Iterator A_first =
      thrust::make_zip_iterator(A0.begin(), A1.begin(), A2.begin());
  Float3Iterator A_last =
      thrust::make_zip_iterator(A0.end(), A1.end(), A2.end());
  Float3Iterator B_first =
      thrust::make_zip_iterator(B0.begin(), B1.begin(), B2.begin());

  // Finally, we pass the zip_iterators into transform() as if they
  // were 'normal' iterators for a device_vector<Float3>.
  thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());

  // METHOD #2
  // Alternatively, we can avoid creating variables for X_first, X_last,
  // and Y_first and invoke transform() directly.
  thrust::transform(
      thrust::make_zip_iterator(A0.begin(), A1.begin(), A2.begin()),
      thrust::make_zip_iterator(A0.end(), A1.end(), A2.end()),
      thrust::make_zip_iterator(B0.begin(), B1.begin(), B2.begin()),
      result.begin(), DotProduct());

  // Finally, we'll print a few results

  // Example output
  // (0.840188,0.45724,0.0860517) * (0.0587587,0.456151,0.322409) = 0.285683
  // (0.394383,0.640368,0.180886) * (0.0138811,0.24875,0.0221609) = 0.168775
  // (0.783099,0.717092,0.426423) * (0.622212,0.0699601,0.234811) = 0.63755
  // (0.79844,0.460067,0.0470658) * (0.0391351,0.742097,0.354747) = 0.389358
  std::cout << std::fixed;
  for (size_t i = 0; i < 4; i++) {
    Float3 a = A_first[i];
    Float3 b = B_first[i];
    float dot = result[i];

    std::cout << "(" << thrust::get<0>(a) << "," << thrust::get<1>(a) << ","
              << thrust::get<2>(a) << ")";
    std::cout << " * ";
    std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << ","
              << thrust::get<2>(b) << ")";
    std::cout << " = ";
    std::cout << dot << std::endl;
  }

  return 0;
}

✅ Max Abs Diff (3.5x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto a = parrot::array({1, 2, 3, 4});
    auto b = parrot::array({2, 4, 3, 0});
    (b - a).abs().maxr().print();  // 4
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/max_abs_diff.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>

#include <cuda/functional>

#include <iostream>

// this example computes the maximum absolute difference
// between the elements of two vectors

template <typename T> struct abs_diff {
  __host__ __device__ T operator()(const T &a, const T &b) {
    return fabsf(b - a);
  }
};

int main() {
  thrust::device_vector<float> d_a = {1.0, 2.0, 3.0, 4.0};
  thrust::device_vector<float> d_b = {2.0, 4.0, 3.0, 0.0};

  // initial value of the reduction
  float init = 0;

  // binary operations
  cuda::maximum<float> binary_op1{};
  abs_diff<float> binary_op2;

  float max_abs_diff = thrust::inner_product(
      d_a.begin(), d_a.end(), d_b.begin(), init, binary_op1, binary_op2);

  std::cout << "maximum absolute difference: " << max_abs_diff << std::endl;
  return 0;
}

✅ Min Max (10.0x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto data = parrot::scalar(89).add(10).repeat(10).rand().print();
    data.minmax().print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/minmax.cu

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

// compute minimum and maximum values in a single reduction

// minmax_pair stores the minimum and maximum
// values that have been encountered so far
template <typename T> struct minmax_pair {
  T min_val;
  T max_val;
};

// minmax_unary_op is a functor that takes in a value x and
// returns a minmax_pair whose minimum and maximum values
// are initialized to x.
template <typename T> struct minmax_unary_op {
  __host__ __device__ minmax_pair<T> operator()(const T &x) const {
    minmax_pair<T> result;
    result.min_val = x;
    result.max_val = x;
    return result;
  }
};

// minmax_binary_op is a functor that accepts two minmax_pair
// structs and returns a new minmax_pair whose minimum and
// maximum values are the min() and max() respectively of
// the minimums and maximums of the input pairs
template <typename T> struct minmax_binary_op {
  __host__ __device__ minmax_pair<T> operator()(const minmax_pair<T> &x,
                                                const minmax_pair<T> &y) const {
    minmax_pair<T> result;
    result.min_val = thrust::min(x.min_val, y.min_val);
    result.max_val = thrust::max(x.max_val, y.max_val);
    return result;
  }
};

int main() {
  // input size
  size_t N = 10;

  // initialize random number generator
  thrust::default_random_engine rng;
  thrust::uniform_int_distribution<int> dist(10, 99);

  // initialize data on host
  thrust::host_vector<int> host_data(N);
  for (auto &e : host_data) {
    e = dist(rng);
  }
  thrust::device_vector<int> data = host_data;

  // setup arguments
  minmax_unary_op<int> unary_op;
  minmax_binary_op<int> binary_op;

  // initialize reduction with the first value
  minmax_pair<int> init = unary_op(data[0]);

  // compute minimum and maximum values
  minmax_pair<int> result = thrust::transform_reduce(data.begin(), data.end(),
                                                     unary_op, init, binary_op);

  // print results
  std::cout << "[ ";
  for (auto &e : host_data) {
    std::cout << e << " ";
  }
  std::cout << "]" << std::endl;

  std::cout << "minimum = " << result.min_val << std::endl;
  std::cout << "maximum = " << result.max_val << std::endl;

  return 0;
}

✅ Mode (11.0x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto data = parrot::scalar(9).repeat(30).rand().print();
    data.sort().print().rle().print().max_by_key(parrot::snd()).print();
    // Output:
    // 2 5 6 2 0 0 4 2 3 8 5 0 1 4 4 7 5 8 3 3 8 2 6 0 7 5 6 0 2 3
    // 0 0 0 0 0 1 2 2 2 2 2 3 3 3 3 4 4 4 5 5 5 5 6 6 6 7 7 8 8 8
    // (0, 5) (1, 1) (2, 5) (3, 4) (4, 3) (5, 4) (6, 3) (7, 2) (8, 3)
    // (0, 5)
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/mode.cu

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#include <iostream>
#include <iterator>

// This example compute the mode [1] of a set of numbers.  If there
// are multiple modes, one with the smallest value it returned.
//
// [1] http://en.wikipedia.org/wiki/Mode_(statistics)

int main() {
  const size_t N = 30;
  const size_t M = 10;
  thrust::default_random_engine rng;
  thrust::uniform_int_distribution<int> dist(0, M - 1);

  // generate random data on the host
  thrust::host_vector<int> h_data(N);
  for (auto &e : h_data) {
    e = dist(rng);
  }

  // transfer data to device
  thrust::device_vector<int> d_data(h_data);

  // print the initial data
  std::cout << "initial data" << std::endl;
  thrust::copy(d_data.begin(), d_data.end(),
               std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // sort data to bring equal elements together
  thrust::sort(d_data.begin(), d_data.end());

  // print the sorted data
  std::cout << "sorted data" << std::endl;
  thrust::copy(d_data.begin(), d_data.end(),
               std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // count number of unique keys
  size_t num_unique = thrust::unique_count(d_data.begin(), d_data.end());

  // count multiplicity of each key
  thrust::device_vector<int> d_output_keys(num_unique);
  thrust::device_vector<int> d_output_counts(num_unique);
  thrust::reduce_by_key(d_data.begin(), d_data.end(),
                        thrust::constant_iterator<int>(1),
                        d_output_keys.begin(), d_output_counts.begin());

  // print the counts
  std::cout << "values" << std::endl;
  thrust::copy(d_output_keys.begin(), d_output_keys.end(),
               std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // print the counts
  std::cout << "counts" << std::endl;
  thrust::copy(d_output_counts.begin(), d_output_counts.end(),
               std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // find the index of the maximum count
  thrust::device_vector<int>::iterator mode_iter;
  mode_iter =
      thrust::max_element(d_output_counts.begin(), d_output_counts.end());

  int mode =
      d_output_keys[cuda::std::distance(d_output_counts.begin(), mode_iter)];
  int occurrences = *mode_iter;

  std::cout << "Modal value " << mode << " occurs " << occurrences << " times "
            << std::endl;

  return 0;
}

✅ Monte Carlo (4.5x less code)#

Parrot Code

#include <iomanip>
#include "parrot.hpp"

int main() {
    int N  = 1000000;
    auto x = parrot::scalar(1.0).repeat(N).rand();
    auto y = parrot::scalar(1.0).repeat(N).rand();
    auto i = x.sq().add(y.sq()).lte(1);
    std::setprecision(3);
    i.sum().div(N).times(4).print();  // 3.142
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/monte_carlo.cu

#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

#include <cmath>
#include <iomanip>
#include <iostream>

// we could vary M & N to find the perf sweet spot

__host__ __device__ unsigned int hash(unsigned int a) {
  a = (a + 0x7ed55d16) + (a << 12);
  a = (a ^ 0xc761c23c) ^ (a >> 19);
  a = (a + 0x165667b1) + (a << 5);
  a = (a + 0xd3a2646c) ^ (a << 9);
  a = (a + 0xfd7046c5) + (a << 3);
  a = (a ^ 0xb55a4f09) ^ (a >> 16);
  return a;
}

struct estimate_pi {
  __host__ __device__ float operator()(unsigned int thread_id) {
    float sum = 0;
    unsigned int N = 10000; // samples per thread

    unsigned int seed = hash(thread_id);

    // seed a random number generator
    thrust::default_random_engine rng(seed);

    // create a mapping from random numbers to [0,1)
    thrust::uniform_real_distribution<float> u01(0, 1);

    // take N samples in a quarter circle
    for (unsigned int i = 0; i < N; ++i) {
      // draw a sample from the unit square
      float x = u01(rng);
      float y = u01(rng);

      // measure distance from the origin
      float dist = sqrtf(x * x + y * y);

      // add 1.0f if (u0,u1) is inside the quarter circle
      if (dist <= 1.0f) {
        sum += 1.0f;
      }
    }

    // multiply by 4 to get the area of the whole circle
    sum *= 4.0f;

    // divide by N
    return sum / N;
  }
};

int main() {
  // use 30K independent seeds
  int M = 30000;

  float estimate = thrust::transform_reduce(
      thrust::counting_iterator<int>(0), thrust::counting_iterator<int>(M),
      estimate_pi(), 0.0f, cuda::std::plus<float>());
  estimate /= M;

  std::cout << std::setprecision(3);
  std::cout << "pi is approximately " << estimate << std::endl;

  return 0;
}

✅ Norm (3.8x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto x      = parrot::array<float>({1.0, 2.0, 3.0, 4.0});
    auto result = x.sq().sum().sqrt().print();  // 5.47723
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/norm.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>

#include <cmath>
#include <iostream>

//   This example computes the norm [1] of a vector.  The norm is
// computed by squaring all numbers in the vector, summing the
// squares, and taking the square root of the sum of squares.  In
// Thrust this operation is efficiently implemented with the
// transform_reduce() algorithm.  Specifically, we first transform
// x -> x^2 and the compute a standard plus reduction.  Since there
// is no built-in functor for squaring numbers, we define our own
// square functor.
//
// [1] http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm

// square<T> computes the square of a number f(x) -> x*x
template <typename T> struct square {
  __host__ __device__ T operator()(const T &x) const { return x * x; }
};

int main() {
  // initialize device vector directly
  thrust::device_vector<float> d_x = {1.0, 2.0, 3.0, 4.0};

  // setup arguments
  square<float> unary_op;
  cuda::std::plus<float> binary_op;
  float init = 0;

  // compute norm
  float norm = std::sqrt(thrust::transform_reduce(d_x.begin(), d_x.end(),
                                                  unary_op, init, binary_op));

  std::cout << "norm is " << norm << std::endl;

  return 0;
}

✅ Padded Grid Reduction (10.6x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto shape = {10, 16};
    auto grid  = parrot::matrix(1.0, shape).rand();
    auto mask  = parrot::range(16).lt(11).cycle({10, 16});
    grid.keep(mask).minmax().print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/padded_grid_reduction.cu

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/random.h>
#include <thrust/transform_reduce.h>

#include <cmath>
#include <iomanip>

#include <float.h>

// This example computes the minimum and maximum values
// over a padded grid.  The padded values are not considered
// during the reduction operation.

// transform a tuple (int,value) into a tuple (bool,value,value)
// where the bool is true for valid grid values and false for
// values in the padded region of the grid
template <typename IndexType, typename ValueType> struct transform_tuple {
  using InputTuple = typename thrust::tuple<IndexType, ValueType>;
  using OutputTuple = typename thrust::tuple<bool, ValueType, ValueType>;

  IndexType n, N;

  transform_tuple(IndexType n, IndexType N) : n(n), N(N) {}

  __host__ __device__ OutputTuple operator()(const InputTuple &t) const {
    bool is_valid = (thrust::get<0>(t) % N) < n;
    return OutputTuple(is_valid, thrust::get<1>(t), thrust::get<1>(t));
  }
};

// reduce two tuples (bool,value,value) into a single tuple such that output
// contains the smallest and largest *valid* values.
template <typename IndexType, typename ValueType> struct reduce_tuple {
  using Tuple = typename thrust::tuple<bool, ValueType, ValueType>;

  __host__ __device__ Tuple operator()(const Tuple &t0, const Tuple &t1) const {
    if (thrust::get<0>(t0) && thrust::get<0>(t1)) // both valid
    {
      return Tuple(true, thrust::min(thrust::get<1>(t0), thrust::get<1>(t1)),
                   thrust::max(thrust::get<2>(t0), thrust::get<2>(t1)));
    } else if (thrust::get<0>(t0)) {
      return t0;
    } else if (thrust::get<0>(t1)) {
      return t1;
    } else {
      return t1; // if neither is valid then it doesn't matter what we return
    }
  }
};

int main() {
  int M = 10; // number of rows
  int n = 11; // number of columns excluding padding
  int N = 16; // number of columns including padding

  thrust::default_random_engine rng(12345);
  thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);

  thrust::device_vector<float> data(M * N, -1);

  // initialize valid values in grid
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < n; j++) {
      data[i * N + j] = dist(rng);
    }
  }

  // print full grid
  std::cout << "padded grid" << std::endl;
  std::cout << std::fixed << std::setprecision(4);
  for (int i = 0; i < M; i++) {
    std::cout << " ";
    for (int j = 0; j < N; j++) {
      std::cout << data[i * N + j] << " ";
    }
    std::cout << "\n";
  }
  std::cout << "\n";

  // compute min & max over valid region of the 2d grid
  using result_type = thrust::tuple<bool, float, float>;

  result_type init(true, FLT_MAX, -FLT_MAX);  // initial value
  transform_tuple<int, float> unary_op(n, N); // transformation operator
  reduce_tuple<int, float> binary_op;         // reduction operator

  result_type result = thrust::transform_reduce(
      thrust::make_zip_iterator(thrust::counting_iterator<int>(0),
                                data.begin()),
      thrust::make_zip_iterator(
          thrust::make_tuple(thrust::counting_iterator<int>(0), data.begin())) +
          data.size(),
      unary_op, init, binary_op);

  std::cout << "minimum value: " << thrust::get<1>(result) << std::endl;
  std::cout << "maximum value: " << thrust::get<2>(result) << std::endl;

  return 0;
}

✅ Permutation Iterator (2.2x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto source  = parrot::array({10, 20, 30, 40, 50, 60});
    auto indices = parrot::array({3, 1, 0, 5});
    source.gather(indices).sum().print();  // 130
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/permutation_iterator.cu

#include <thrust/device_vector.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>

#include <iostream>

// this example fuses a gather operation with a reduction for
// greater efficiency than separate gather() and reduce() calls

int main() {
  // gather locations
  thrust::device_vector<int> map = {3, 1, 0, 5};

  // array to gather from
  thrust::device_vector<int> source = {10, 20, 30, 40, 50, 60};

  // fuse gather with reduction:
  //   sum = source[map[0]] + source[map[1]] + ...
  int sum = thrust::reduce(
      thrust::make_permutation_iterator(source.begin(), map.begin()),
      thrust::make_permutation_iterator(source.begin(), map.end()));

  // print sum
  std::cout << "sum is " << sum << std::endl;

  return 0;
}

✅ Remove Points2d (5.0x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto is_in_circle = [] __host__ __device__(float x, float y) {
        return x * x + y * y <= 1;
    };
    auto x = parrot::scalar(1.0f).repeat(20).rand();
    auto y = parrot::scalar(1.0f).repeat(20).rand();
    auto p = x.pairs(y).filter(thrust::make_zip_function(is_in_circle)).print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/remove_points2d.cu

#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/remove.h>

// This example generates random points in the
// unit square [0,1)x[0,1) and then removes all
// points where x^2 + y^2 > 1
//
// The x and y coordinates are stored in separate arrays
// and a zip_iterator is used to combine them together

template <typename T> struct is_outside_circle {
  template <typename Tuple>
  inline __host__ __device__ bool operator()(const Tuple &tuple) const {
    // unpack the tuple into x and y coordinates
    const T x = thrust::get<0>(tuple);
    const T y = thrust::get<1>(tuple);

    if (x * x + y * y > 1) {
      return true;
    } else {
      return false;
    }
  }
};

int main() {
  const size_t N = 20;

  // generate random points in the unit square on the host
  thrust::default_random_engine rng;
  thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
  thrust::host_vector<float> x(N);
  thrust::host_vector<float> y(N);
  for (size_t i = 0; i < N; i++) {
    x[i] = u01(rng);
    y[i] = u01(rng);
  }

  // print the initial points
  std::cout << std::fixed;
  std::cout << "Generated " << N << " points" << std::endl;
  for (size_t i = 0; i < x.size(); i++) {
    std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
  }
  std::cout << std::endl;

  // remove points where x^2 + y^2 > 1 and determine new array sizes
  size_t new_size =
      thrust::remove_if(thrust::make_zip_iterator(x.begin(), y.begin()),
                        thrust::make_zip_iterator(x.end(), y.end()),
                        is_outside_circle<float>()) -
      thrust::make_zip_iterator(x.begin(), y.begin());

  // resize the vectors (note: this does not free any memory)
  x.resize(new_size);
  y.resize(new_size);

  // print the filtered points
  std::cout << "After stream compaction, " << new_size << " points remain"
            << std::endl;
  for (size_t i = 0; i < x.size(); i++) {
    std::cout << "(" << x[i] << "," << y[i] << ")" << std::endl;
  }

  return 0;
}

✅ Run Length Encoding (3.7x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    const char data[] = "aaabbbbbcddeeeeeeeeeff";
    const size_t N    = (sizeof(data) / sizeof(char)) - 1;
    auto input        = std::vector(data, data + N);
    parrot::array(input)
      .rle()
      .print();  //(a, 3) (b, 5) (c, 1) (d, 2) (e, 9) (f, 2)
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/run_length_encoding.cu

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>

#include <iostream>
#include <iterator>

// This example computes a run-length code [1] for an array of characters.
//
// [1] http://en.wikipedia.org/wiki/Run-length_encoding

int main() {
  // input data on the host
  const char data[] = "aaabbbbbcddeeeeeeeeeff";

  const size_t N = (sizeof(data) / sizeof(char)) - 1;

  // copy input data to the device
  thrust::device_vector<char> input(data, data + N);

  // allocate storage for output data and run lengths
  thrust::device_vector<char> output(N);
  thrust::device_vector<int> lengths(N);

  // print the initial data
  std::cout << "input data:" << std::endl;
  thrust::copy(input.begin(), input.end(),
               std::ostream_iterator<char>(std::cout, ""));
  std::cout << std::endl << std::endl;

  // compute run lengths
  size_t num_runs =
      thrust::reduce_by_key(
          input.begin(),
          input.end(),                       // input key sequence
          thrust::constant_iterator<int>(1), // input value sequence
          output.begin(),                    // output key sequence
          lengths.begin()                    // output value sequence
          )
          .first -
      output.begin(); // compute the output size

  // print the output
  std::cout << "run-length encoded output:" << std::endl;
  for (size_t i = 0; i < num_runs; i++) {
    std::cout << "(" << output[i] << "," << lengths[i] << ")";
  }
  std::cout << std::endl;

  return 0;
}

✅ Saxpy (7.0x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto x = parrot::array({1.0, 1.0, 1.0, 1.0});
    auto y = parrot::array({1.0, 2.0, 3.0, 4.0});
    x.times(2.0).add(y).print();  // 3 4 5 6
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/saxpy.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform.h>

#include <algorithm>
#include <iostream>
#include <iterator>

// This example illustrates how to implement the SAXPY
// operation (Y[i] = a * X[i] + Y[i]) using Thrust.
// The saxpy_slow function demonstrates the most
// straightforward implementation using a temporary
// array and two separate transformations, one with
// multiplies and one with plus.  The saxpy_fast function
// implements the operation with a single transformation
// and represents "best practice".

struct saxpy_functor {
  const float a;

  saxpy_functor(float _a) : a(_a) {}

  __host__ __device__ float operator()(const float &x, const float &y) const {
    return a * x + y;
  }
};

void saxpy_fast(float A, thrust::device_vector<float> &X,
                thrust::device_vector<float> &Y) {
  // Y <- A * X + Y
  thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}

void saxpy_slow(float A, thrust::device_vector<float> &X,
                thrust::device_vector<float> &Y) {
  thrust::device_vector<float> temp(X.size());

  // temp <- A
  thrust::fill(temp.begin(), temp.end(), A);

  // temp <- A * X
  thrust::transform(X.begin(), X.end(), temp.begin(), temp.begin(),
                    cuda::std::multiplies<float>());

  // Y <- A * X + Y
  thrust::transform(temp.begin(), temp.end(), Y.begin(), Y.begin(),
                    cuda::std::plus<float>());
}

int main() {
  // initialize host arrays
  thrust::host_vector<float> x{1.0, 1.0, 1.0, 1.0};
  thrust::host_vector<float> y{1.0, 2.0, 3.0, 4.0};

  {
    // transfer to device
    thrust::device_vector<float> X(x);
    thrust::device_vector<float> Y(y);

    // slow method
    saxpy_slow(2.0, X, Y);
  }

  {
    // transfer to device
    thrust::device_vector<float> X(x);
    thrust::device_vector<float> Y(y);

    // fast method
    saxpy_fast(2.0, X, Y);
  }

  return 0;
}

✅ Simple Moving Average (8.8x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto data = parrot::scalar(10).repeat(30).rand().print();
    auto sums = data.prepend(0).sums();
    sums.drop(4).minus(sums.take(27)).div(4.0).reshape({3, 9}).print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/simple_moving_average.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <iomanip>
#include <iostream>

// Efficiently computes the simple moving average (SMA) [1] of a data series
// using a parallel prefix-sum or "scan" operation.
//
// Note: additional numerical precision should be used in the cumulative summing
// stage when computing the SMA of large data series.  The most straightforward
// remedy is to replace 'float' with 'double'.   Alternatively a Kahan or
// "compensated" summation algorithm could be applied [2].
//
// [1] http://en.wikipedia.org/wiki/Moving_average#Simple_moving_average
// [2] http://en.wikipedia.org/wiki/Kahan_summation_algorithm

// compute the difference of two positions in the cumumulative sum and
// divide by the SMA window size w.
template <typename T> struct minus_and_divide {
  T w;

  minus_and_divide(T w) : w(w) {}

  __host__ __device__ T operator()(const T &a, const T &b) const {
    return (a - b) / w;
  }
};

template <typename InputVector, typename OutputVector>
void simple_moving_average(const InputVector &data, size_t w,
                           OutputVector &output) {
  using T = typename InputVector::value_type;

  if (data.size() < w) {
    return;
  }

  // allocate storage for cumulative sum
  thrust::device_vector<T> temp(data.size() + 1);

  // compute cumulative sum
  thrust::exclusive_scan(data.begin(), data.end(), temp.begin());
  temp[data.size()] = data.back() + temp[data.size() - 1];

  // compute moving averages from cumulative sum
  thrust::transform(temp.begin() + w, temp.end(), temp.begin(), output.begin(),
                    minus_and_divide<T>(T(w)));
}

int main() {
  // length of data series
  size_t n = 30;

  // window size of the moving average
  size_t w = 4;

  // generate random data series
  thrust::host_vector<float> host_data(n);
  thrust::default_random_engine rng;
  thrust::uniform_int_distribution<int> dist(0, 10);
  for (auto &e : host_data) {
    e = static_cast<float>(dist(rng));
  }
  thrust::device_vector<float> data = host_data;

  // allocate storage for averages
  thrust::device_vector<float> averages(data.size() - (w - 1));

  // compute SMA using standard summation
  simple_moving_average(data, w, averages);

  // print data series
  std::cout << "data series: [ ";
  for (const auto &value : data) {
    std::cout << value << " ";
  }
  std::cout << "]" << std::endl;

  // print moving averages
  std::cout << "simple moving averages (window = " << w << ")" << std::endl;
  for (size_t i = 0; i < averages.size(); i++) {
    std::cout << "  [" << std::setw(2) << i << "," << std::setw(2) << (i + w)
              << ") = " << averages[i] << std::endl;
  }

  return 0;
}

✅ Sort (12.3x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto const N = 10;
    auto ints    = parrot::scalar(100).repeat(N).rand();
    auto floats  = parrot::scalar(10.0).repeat(N).rand();

    auto even_first = [] __host__ __device__(int x) { return x % 2; };

    ints.sort().print();                           // sort integers
    ints.sort_by(thrust::greater<int>()).print();  // sort integers descending
    ints.sort_by_key(even_first).print();  // sort integers (user-defined)
    floats.sort().print();                 // sort floats
    floats.pairs(ints).sort().print();     // sort pairs
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sort.cu

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include <cuda/std/utility>

#include <iomanip>
#include <iostream>
#include <numeric>

// Helper routines

void initialize(thrust::device_vector<int> &v) {
  thrust::default_random_engine rng(123456);
  thrust::uniform_int_distribution<int> dist(10, 99);
  thrust::host_vector<int> host_data(v.size());
  for (auto &e : host_data) {
    e = dist(rng);
  }
  v = host_data;
}

void initialize(thrust::device_vector<float> &v) {
  thrust::default_random_engine rng(123456);
  thrust::uniform_int_distribution<int> dist(2, 19);
  thrust::host_vector<float> host_data(v.size());
  for (auto &e : host_data) {
    e = dist(rng) / 2.0f;
  }
  v = host_data;
}

void initialize(thrust::device_vector<cuda::std::pair<int, int>> &v) {
  thrust::default_random_engine rng(123456);
  thrust::uniform_int_distribution<int> dist(0, 9);
  thrust::host_vector<cuda::std::pair<int, int>> host_data(v.size());
  for (auto &e : host_data) {
    int a = dist(rng);
    int b = dist(rng);
    e = cuda::std::make_pair(a, b);
  }
  v = host_data;
}

void initialize(thrust::device_vector<int> &v1,
                thrust::device_vector<int> &v2) {
  thrust::default_random_engine rng(123456);
  thrust::uniform_int_distribution<int> dist(10, 99);
  thrust::host_vector<int> host_data(v1.size());
  for (auto &e : host_data) {
    e = dist(rng);
  }
  v1 = host_data;
  thrust::sequence(v2.begin(), v2.end(), 0);
}

void print(const thrust::device_vector<int> &v) {
  for (const auto &value : v) {
    std::cout << " " << value;
  }
  std::cout << "\n";
}

void print(const thrust::device_vector<float> &v) {
  for (const auto &value : v) {
    std::cout << " " << std::fixed << std::setprecision(1) << value;
  }
  std::cout << "\n";
}

void print(const thrust::device_vector<cuda::std::pair<int, int>> &v) {
  for (const auto &p : v) {
    cuda::std::pair<int, int> local_p = p;
    std::cout << " (" << local_p.first << "," << local_p.second << ")";
  }
  std::cout << "\n";
}

void print(thrust::device_vector<int> &v1, thrust::device_vector<int> v2) {
  for (size_t i = 0; i < v1.size(); i++) {
    std::cout << " (" << v1[i] << "," << std::setw(2) << v2[i] << ")";
  }
  std::cout << "\n";
}

// user-defined comparison operator that acts like less<int>,
// except even numbers are considered to be smaller than odd numbers
struct evens_before_odds {
  __host__ __device__ bool operator()(int x, int y) {
    if (x % 2 == y % 2) {
      return x < y;
    } else if (x % 2) {
      return false;
    } else {
      return true;
    }
  }
};

int main() {
  size_t N = 16;

  std::cout << "sorting integers\n";
  {
    thrust::device_vector<int> keys(N);
    initialize(keys);
    print(keys);
    thrust::sort(keys.begin(), keys.end());
    print(keys);
  }

  std::cout << "\nsorting integers (descending)\n";
  {
    thrust::device_vector<int> keys(N);
    initialize(keys);
    print(keys);
    thrust::sort(keys.begin(), keys.end(), cuda::std::greater<int>());
    print(keys);
  }

  std::cout << "\nsorting integers (user-defined comparison)\n";
  {
    thrust::device_vector<int> keys(N);
    initialize(keys);
    print(keys);
    thrust::sort(keys.begin(), keys.end(), evens_before_odds());
    print(keys);
  }

  std::cout << "\nsorting floats\n";
  {
    thrust::device_vector<float> keys(N);
    initialize(keys);
    print(keys);
    thrust::sort(keys.begin(), keys.end());
    print(keys);
  }

  std::cout << "\nsorting pairs\n";
  {
    thrust::device_vector<cuda::std::pair<int, int>> keys(N);
    initialize(keys);
    print(keys);
    thrust::sort(keys.begin(), keys.end());
    print(keys);
  }

  std::cout << "\nkey-value sorting\n";
  {
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    initialize(keys, values);
    print(keys, values);
    thrust::sort_by_key(keys.begin(), keys.end(), values.begin());
    print(keys, values);
  }

  std::cout << "\nkey-value sorting (descending)\n";
  {
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    initialize(keys, values);
    print(keys, values);
    thrust::sort_by_key(keys.begin(), keys.end(), values.begin(),
                        cuda::std::greater<int>());
    print(keys, values);
  }

  return 0;
}

✅ Sum (5.2x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    parrot::scalar(9999).repeat(100).sum().print();  //
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sum.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/reduce.h>

int my_rand() {
  static thrust::default_random_engine rng;
  static thrust::uniform_int_distribution<int> dist(0, 9999);
  return dist(rng);
}

int main() {
  // generate random data on the host
  thrust::host_vector<int> h_vec(100);
  thrust::generate(h_vec.begin(), h_vec.end(), my_rand);

  // transfer to device and compute sum
  thrust::device_vector<int> d_vec = h_vec;

  // initial value of the reduction
  int init = 0;

  // binary operation used to reduce values
  cuda::std::plus<int> binary_op;

  // compute sum on the device
  int sum = thrust::reduce(d_vec.begin(), d_vec.end(), init, binary_op);

  // print the sum
  std::cout << "sum is " << sum << std::endl;

  return 0;
}

✅ Sum Rows (8.6x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto matrix = parrot::matrix(89, {5, 8}).add(10).rand();
    matrix.sum<2>().print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/sum_rows.cu

#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/tabulate.h>
#include <thrust/universal_vector.h>

#include <cuda/iterator>
#include <cuda/std/mdspan>

#include <iomanip>
#include <iostream>

#include "include/host_device.h"

int main() {
  const int rows = 32;
  const int cols = 16;

  // Create a 2D multidimensional array of ints.
  thrust::universal_vector<int> data(rows * cols, 42);
  cuda::std::mdspan M(thrust::raw_pointer_cast(data.data()), rows, cols);

  // Create an iterator to the flat linear index space for the multidimensional
  // array.
  auto flat_idx = cuda::counting_iterator(0);

  // Fill the array with pseudorandom inputs in parallel on the device.
  thrust::tabulate(thrust::device, M.data_handle(), M.data_handle() + M.size(),
                   [] __host__ __device__(int flat) {
                     thrust::default_random_engine rng;
                     thrust::uniform_int_distribution<int> dist(0, 3);
                     rng.discard(
                         flat); // Advance to the current element's position.
                     return dist(rng);
                   });

  // Create a range to the row index of each element.
  auto row_idx_begin = thrust::make_transform_iterator(
      flat_idx, [=] __host__ __device__(int flat) { return flat / cols; });
  auto row_idx_end = row_idx_begin + M.size();

  // Sum each row, storing the result in a new vector.
  thrust::universal_vector<int> sums(rows);
  thrust::reduce_by_key(thrust::device, row_idx_begin, row_idx_end,
                        M.data_handle(), thrust::make_discard_iterator(),
                        sums.begin());

  // Output the result.
  thrust::for_each_n(thrust::seq, flat_idx, rows, [&](int i) {
    std::cout << "[ ";
    thrust::for_each_n(thrust::seq, flat_idx, cols, [&](int j) {
      std::cout << std::setw(2) << M(i, j) << " ";
    });
    std::cout << "] = " << sums[i] << "\n";
  });
}

✅ Summed Area Table (14.4x less code)#

Parrot Code

#include "parrot.hpp"

int main() {
    auto m = parrot::matrix(1, {3, 4});
    m.sums<2>().sums<1>().print();
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/summed_area_table.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>

#include <iomanip>
#include <iostream>

// This example computes a summed area table using segmented scan
// http://en.wikipedia.org/wiki/Summed_area_table

// convert a linear index to a linear index in the transpose
struct transpose_index {
  size_t m, n;

  __host__ __device__ transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}

  __host__ __device__ size_t operator()(size_t linear_index) {
    size_t i = linear_index / n;
    size_t j = linear_index % n;

    return m * j + i;
  }
};

// convert a linear index to a row index
struct row_index {
  size_t n;

  __host__ __device__ row_index(size_t _n) : n(_n) {}

  __host__ __device__ size_t operator()(size_t i) { return i / n; }
};

// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T> &src,
               thrust::device_vector<T> &dst) {
  thrust::counting_iterator<size_t> indices(0);

  thrust::gather(
      thrust::make_transform_iterator(indices, transpose_index(n, m)),
      thrust::make_transform_iterator(indices, transpose_index(n, m)) +
          dst.size(),
      src.begin(), dst.begin());
}

// scan the rows of an M-by-N array
template <typename T>
void scan_horizontally(size_t n, thrust::device_vector<T> &d_data) {
  thrust::counting_iterator<size_t> indices(0);

  thrust::inclusive_scan_by_key(
      thrust::make_transform_iterator(indices, row_index(n)),
      thrust::make_transform_iterator(indices, row_index(n)) + d_data.size(),
      d_data.begin(), d_data.begin());
}

// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T> &d_data) {
  thrust::host_vector<T> h_data = d_data;

  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < n; j++) {
      std::cout << std::setw(8) << h_data[i * n + j] << " ";
    }
    std::cout << "\n";
  }
}

int main() {
  size_t m = 3; // number of rows
  size_t n = 4; // number of columns

  // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
  thrust::device_vector<int> data(m * n, 1);

  std::cout << "[step 0] initial array" << std::endl;
  print(m, n, data);

  std::cout << "[step 1] scan horizontally" << std::endl;
  scan_horizontally(n, data);
  print(m, n, data);

  std::cout << "[step 2] transpose array" << std::endl;
  thrust::device_vector<int> temp(m * n);
  transpose(m, n, data, temp);
  print(n, m, temp);

  std::cout << "[step 3] scan transpose horizontally" << std::endl;
  scan_horizontally(m, temp);
  print(n, m, temp);

  std::cout << "[step 4] transpose the transpose" << std::endl;
  transpose(n, m, temp, data);
  print(m, n, data);

  return 0;
}

✅ Transform Iterator (8.9x less code)#

Parrot Code

#include <parrot.hpp>

int main() {
    auto values  = parrot::array({2, 5, 7, 1, 6, 0, 3, 8});
    auto clamp   = values.max(1).min(5).print();        // 2 5 5 1 5 1 3 5
    auto sum     = clamp.sum().print();                 // 27
    auto iota    = parrot::range(10).minus(1).print();  // 0 1 2 3 4 5 6 7 8 9
    auto clamp_i = iota.max(1).min(5).print();          // 1 2 3 4 5 5 5 5 5 5
    auto negated = clamp_i.neg().print();  // -1 -2 -3 -4 -5 -5 -5 -5 -5 -5
}

Thrust Code

Link: https://github.com/NVIDIA/cccl/blob/main/thrust/examples/transform_iterator.cu

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

#include <iostream>
#include <iterator>
#include <string>

// this functor clamps a value to the range [lo, hi]
template <typename T> struct clamp {
  T lo, hi;

  __host__ __device__ clamp(T _lo, T _hi) : lo(_lo), hi(_hi) {}

  __host__ __device__ T operator()(T x) {
    if (x < lo) {
      return lo;
    } else if (x < hi) {
      return x;
    } else {
      return hi;
    }
  }
};

template <typename T> struct simple_negate {
  __host__ __device__ T operator()(T x) { return -x; }
};

template <typename Iterator>
void print_range(const std::string &name, Iterator first, Iterator last) {
  using T = typename std::iterator_traits<Iterator>::value_type;

  std::cout << name << ": ";
  thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));
  std::cout << "\n";
}

int main() {
  // clamp values to the range [1, 5]
  int lo = 1;
  int hi = 5;

  // define some types
  using Vector = thrust::device_vector<int>;
  using VectorIterator = Vector::iterator;

  // initialize values
  Vector values(8);

  values[0] = 2;
  values[1] = 5;
  values[2] = 7;
  values[3] = 1;
  values[4] = 6;
  values[5] = 0;
  values[6] = 3;
  values[7] = 8;

  print_range("values         ", values.begin(), values.end());

  // define some more types
  using ClampedVectorIterator =
      thrust::transform_iterator<clamp<int>, VectorIterator>;

  // create a transform_iterator that applies clamp() to the values array
  ClampedVectorIterator cv_begin =
      thrust::make_transform_iterator(values.begin(), clamp<int>(lo, hi));
  ClampedVectorIterator cv_end = cv_begin + values.size();

  // now [clamped_begin, clamped_end) defines a sequence of clamped values
  print_range("clamped values ", cv_begin, cv_end);

  ////
  // compute the sum of the clamped sequence with reduce()
  std::cout << "sum of clamped values : " << thrust::reduce(cv_begin, cv_end)
            << "\n";

  ////
  // combine transform_iterator with other fancy iterators like
  // counting_iterator
  using CountingIterator = thrust::counting_iterator<int>;
  using ClampedCountingIterator =
      thrust::transform_iterator<clamp<int>, CountingIterator>;

  CountingIterator count_begin(0);
  CountingIterator count_end(10);

  print_range("sequence         ", count_begin, count_end);

  ClampedCountingIterator cs_begin =
      thrust::make_transform_iterator(count_begin, clamp<int>(lo, hi));
  ClampedCountingIterator cs_end =
      thrust::make_transform_iterator(count_end, clamp<int>(lo, hi));

  print_range("clamped sequence ", cs_begin, cs_end);

  ////
  // combine transform_iterator with another transform_iterator
  using NegatedClampedCountingIterator =
      thrust::transform_iterator<cuda::std::negate<int>,
                                 ClampedCountingIterator>;

  NegatedClampedCountingIterator ncs_begin =
      thrust::make_transform_iterator(cs_begin, cuda::std::negate<int>());
  NegatedClampedCountingIterator ncs_end =
      thrust::make_transform_iterator(cs_end, cuda::std::negate<int>());

  print_range("negated sequence ", ncs_begin, ncs_end);

  ////
  // when a functor does not define result_type, a third template argument must
  // be provided
  using NegatedVectorIterator =
      thrust::transform_iterator<simple_negate<int>, VectorIterator, int>;

  NegatedVectorIterator nv_begin(values.begin(), simple_negate<int>());
  NegatedVectorIterator nv_end(values.end(), simple_negate<int>());

  print_range("negated values ", nv_begin, nv_end);

  return 0;
}

Parrot vs Thrust#

Examples#

✅ Arbitrary Transformation (6.1x less code)#

✅ Basic Vector (2.7x less code)#

✅ Bounding Box (7.3x less code)#

✅ Constant Iterator (2.3x less code)#

✅ Counting Iterator (2.5x less code)#

✅ Dot Products With Zip (10.1x less code)#

✅ Max Abs Diff (3.5x less code)#

✅ Min Max (10.0x less code)#

✅ Mode (11.0x less code)#

✅ Monte Carlo (4.5x less code)#

✅ Norm (3.8x less code)#

✅ Padded Grid Reduction (10.6x less code)#

✅ Permutation Iterator (2.2x less code)#

✅ Remove Points2d (5.0x less code)#

✅ Run Length Encoding (3.7x less code)#

✅ Saxpy (7.0x less code)#

✅ Simple Moving Average (8.8x less code)#

✅ Sort (12.3x less code)#

✅ Sum (5.2x less code)#

✅ Sum Rows (8.6x less code)#

✅ Summed Area Table (14.4x less code)#

✅ Transform Iterator (8.9x less code)#

Examples #