Examples#

This page provides standalone Parrot examples that demonstrate common operations and patterns, along with real-world examples showing code before and after it was rewritten with Parrot.

Getting Started Examples #

These examples demonstrate basic Parrot functionality and common patterns.

Map Reduce#

#include "parrot.hpp"

int main() {
    // Map-reduce: sum of the first 100k odd numbers.
    // Assuming range(N) yields 1..N, each element k is mapped to 2*k - 1 and the
    // mapped values are then summed.
    int N    = 100000;
    // NOTE(review): the exact sum of the first N odd numbers is N*N = 10,000,000,000,
    // which does not fit in a 32-bit int. The value shown below equals
    // 10,000,000,000 mod 2^32, i.e. the wrapped result - presumably because the
    // chain takes its element type from `int N`. Confirm against the Parrot API and
    // widen the element type if the true sum is wanted.
    auto sum = parrot::range(N)  //
                 .times(2)
                 .minus(1)
                 .sum()
                 .print();  // 1,410,065,408
}

Max Diff#

#include "parrot.hpp"

int main() {
    // Find maximum absolute difference between consecutive elements
    auto data     = parrot::array({10, 5, 8, 3, 12, 1});
    // For the data above the absolute consecutive differences are 5, 3, 5, 9, 11,
    // so the maximum is 11 (between the last two elements, 12 and 1).
    auto max_diff = data  //
                      .deltas()
                      .abs()
                      .maxr()
                      .print();  // 11 (|1 - 12|)
}

Sum Of Squares#

#include "parrot.hpp"

int main() {
    // Calculate sum of squares of 1 million numbers
    int N    = 1000000;
    // NOTE(review): the ~28.5M figure below is consistent with each of the N values
    // being drawn uniformly from the integers 0..9 (mean square 28.5) - presumably
    // what scalar(10).repeat(N).rand() produces; confirm against the Parrot API.
    auto sos = parrot::scalar(10)  //
                 .repeat(N)
                 .rand()
                 .sq()
                 .sum()
                 .print();  // ~ 28.5M
}

Real World Examples #

These examples show real-world code transformations using Parrot. Each example presents the original code followed by its Parrot rewrite, highlighting how Parrot simplifies complex operations.

Aresdb Expand (2.1x code reduction)#

Source: Original Code

Before (Original)

// Expands the dimension values of inputKeys into outputKeys: each of the
// indexVectorLen input entries is repeated according to its count read through
// countIter, and the resulting values are copied into outputKeys.
// The number of expanded entries is limited to the capacity left in outputKeys
// (VectorCapacity - outputOccupiedLen).
// Returns outputLen + outputOccupiedLen, i.e. the occupied length of the output
// after the copy.
int expand(DimensionVector inputKeys, DimensionVector outputKeys,
           uint32_t *baseCounts, uint32_t *indexVector, int indexVectorLen,
           int outputOccupiedLen, cudaStream_t cudaStream) {
  // create count iterator from baseCount and indexVector
  IndexCountIterator countIter = IndexCountIterator(baseCounts, indexVector);

  // total item counts by adding counts together
  uint32_t totalCount = thrust::reduce(GET_EXECUTION_POLICY(cudaStream),
                                       countIter, countIter + indexVectorLen);

  // scan the counts to obtain output offsets for each input element
  ares::device_vector<uint32_t> offsets(indexVectorLen);
  thrust::exclusive_scan(GET_EXECUTION_POLICY(cudaStream), countIter,
                         countIter + indexVectorLen, offsets.begin());

  // scatter the nonzero counts into their corresponding output positions
  // (the value scattered is the input position 0..indexVectorLen-1; countIter
  // acts as the stencil, so zero-count entries are skipped)
  ares::device_vector<uint32_t> indices(totalCount);
  thrust::scatter_if(GET_EXECUTION_POLICY(cudaStream),
                     thrust::counting_iterator<uint32_t>(0),
                     thrust::counting_iterator<uint32_t>(indexVectorLen),
                     offsets.begin(), countIter, indices.begin());

  // compute max-scan over the indices, filling in the holes
  thrust::inclusive_scan(GET_EXECUTION_POLICY(cudaStream), indices.begin(),
                         indices.end(), indices.begin(),
                         thrust::maximum<uint32_t>());

  // get the raw pointer from device/host vector
  uint32_t *newIndexVector = thrust::raw_pointer_cast(&indices[0]);

  // clamp the number of expanded entries to the space left in the output
  int outputLen =
      min(totalCount, outputKeys.VectorCapacity - outputOccupiedLen);
  // start the real copy operation
  DimensionColumnPermutateIterator iterIn(inputKeys.DimValues, newIndexVector,
                                          inputKeys.VectorCapacity, outputLen,
                                          inputKeys.NumDimsPerDimWidth);

  DimensionColumnOutputIterator iterOut(
      outputKeys.DimValues, outputKeys.VectorCapacity, outputLen,
      inputKeys.NumDimsPerDimWidth, outputOccupiedLen);

  // total number of dimensions across all dimension widths
  int numDims = 0;
  for (int i = 0; i < NUM_DIM_WIDTH; i++) {
    numDims += inputKeys.NumDimsPerDimWidth[i];
  }
  // copy dim values into output
  thrust::copy(GET_EXECUTION_POLICY(cudaStream), iterIn,
               iterIn + numDims * 2 * outputLen, iterOut);
  // return total count in the output dimensionVector
  return outputLen + outputOccupiedLen;
}

After (Parrot)

// this is a simplified version of the expand
// function that assumes all dim widths are 1
//
// Builds a per-key repeat count from base_counts and indices, limited by
// capacity, and returns input_keys replicated by those counts.
// NOTE(review): the per-step notes below are inferred from the method names and
// from the thrust version of expand - confirm against the Parrot API.
template <typename BaseCountsArray,
          typename IndexArray,
          typename InputKeysArray>
auto expand_parrot(const InputKeysArray &input_keys,
                   const BaseCountsArray &base_counts,
                   const IndexArray &indices,
                   int capacity) {
    auto counts = base_counts.deltas()  // presumably per-entry counts from cumulative base counts
                    .gather(indices)    // presumably the counts selected by `indices`
                    .sums()             // presumably a running total of those counts
                    .min(capacity)      // presumably clamps the running total to `capacity`
                    .prepend(0)
                    .deltas()           // presumably back to per-entry counts, now capacity-limited
                    .cycle({input_keys.size()});
    return input_keys.replicate(counts);
}

Fastllm Topk (5.7x code reduction)#

Source: Original Code

Before (Original)

// Processes row i of the input: sorts the row's `channels` values in descending
// order together with their column indices, then writes the first `topk`
// (index, value) pairs to the output.
// Output layout for row i: d_output[i * topk * 2 + k * 2] holds the index and
// the following slot holds the value of the k-th largest element.
// Note that the sort runs over row_start itself, so the input row is reordered
// in place.
__host__ __device__ void operator()(int i) const {
  thrust::device_ptr<float> d_input(cudaInput);
  thrust::device_ptr<float> d_output(cudaOutput);

  // Start position of the current row
  thrust::device_ptr<float> row_start = d_input + i * channels;
  thrust::device_ptr<float> row_end = row_start + channels;

  // Create the index sequence [0, 1, 2, ..., channels-1]
  thrust::device_vector<int> indices(channels);
  thrust::sequence(indices.begin(), indices.end());

  // Use zip iterators to combine the values and the indices
  auto begin =
      thrust::make_zip_iterator(thrust::make_tuple(row_start, indices.begin()));
  auto end =
      thrust::make_zip_iterator(thrust::make_tuple(row_end, indices.end()));

  // Sort by value in descending order
  thrust::sort(begin, end, thrust::greater<thrust::tuple<float, int>>());

  // Copy the first topk results to the output
  for (int k = 0; k < topk; ++k) {
    d_output[i * topk * 2 + k * 2] = indices[k];       // index
    d_output[i * topk * 2 + k * 2 + 1] = row_start[k]; // value
  }
}

After (Parrot)

// Returns the first k entries of d_input after it has been enumerated, sorted
// and reversed.
// NOTE(review): presumably enumerate() attaches each element's index and sort()
// orders by value, so the result is the k largest entries with their indices -
// confirm against the Parrot API.
auto topk(auto d_input, int k) {
    return d_input.enumerate().sort().rev().take(k);
}

Paddle Paddle Mode (3.2x code reduction)#

Source: Original Code

Before (Original)

// For each of the num_rows rows (num_cols values each) of input_tensor, finds
// the most frequent value and an associated column index, and writes them to
// out_tensor[i] and indices_tensor[i].
// The input is first copied into a temporary tensor because each row is sorted
// in place.
// NOTE(review): T is not declared in this excerpt - presumably a template
// parameter of the enclosing declaration.
static void GetModebySort(const phi::GPUContext &dev_ctx,
                          const DenseTensor *input_tensor,
                          const int64_t num_cols, const int64_t num_rows,
                          T *out_tensor, int64_t *indices_tensor) {
  // Working copy of the input, shaped {num_rows, num_cols}
  DenseTensor input_tmp;
  input_tmp.Resize(common::make_ddim({num_rows, num_cols}));
  T *input_tmp_data = dev_ctx.Alloc<T>(&input_tmp);
  phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp);

  thrust::device_ptr<T> out_tensor_ptr(out_tensor);
  thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);

  for (int64_t i = 0; i < num_rows; ++i) {
    // [begin, end) is row i of the working copy
    T *begin = input_tmp_data + num_cols * i;
    T *end = input_tmp_data + num_cols * (i + 1);
    // Column indices 0..num_cols-1, permuted alongside the values by the sort
    thrust::device_vector<int64_t> indices_data(num_cols);
    thrust::sequence(thrust::device, indices_data.begin(),
                     indices_data.begin() + num_cols);
    thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
    // Number of distinct values = 1 + number of adjacent pairs that differ
    int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
                                           begin + 1, 0, thrust::plus<int>(),
                                           thrust::not_equal_to<T>());
    // Run-length encode the sorted row: one (value, count) pair per distinct value
    thrust::device_vector<T> keys_data(unique);
    thrust::device_vector<int64_t> cnts_data(unique);
    thrust::reduce_by_key(thrust::device, begin, end,
                          thrust::constant_iterator<int>(1), keys_data.begin(),
                          cnts_data.begin());
    // The mode is the value with the largest count
    auto it = thrust::max_element(thrust::device, cnts_data.begin(),
                                  cnts_data.begin() + unique);
    T mode = keys_data[it - cnts_data.begin()];
    int64_t counts = cnts_data[it - cnts_data.begin()];
    // pos is the first position of the mode in the sorted row; adding counts - 1
    // selects the last element of the mode's run, whose carried column index is
    // reported
    auto pos = thrust::find(thrust::device, begin, end, mode);
    int64_t index = indices_data[pos - begin + counts - 1];
    out_tensor_ptr[i] = static_cast<T>(mode);
    indices_tensor_ptr[i] = static_cast<int64_t>(index);
  }
}

After (Parrot)

// For each of the num_rows rows of data, computes the row's mode and the result
// of last_index_of(mode) on that row, and returns all (mode, index) pairs as a
// Parrot array.
// num_cols is accepted but not used in this body.
template <typename Array>
auto GetModeBySort_Parrot(const Array& data, int num_rows, int num_cols) {
    using T = typename Array::value_type;
    // One (mode, index) pair per row, collected on the host before conversion
    std::vector<thrust::pair<T, int>> results;

    for (int r = 0; r < num_rows; ++r) {
        auto mode  = parrot::stats::mode(data.row(r)).value();
        auto index = data.row(r).last_index_of(mode);
        results.push_back(thrust::make_pair(mode, index));
    }

    return parrot::array(results);
}

Fused

1-index Maps (Unary)

1-index Maps (Binary)

2-index Maps

Joins

Products

Reshapes

Copying

Permutations

Conditionally Fused

Compactions

Materializing

Reductions

Scans

Permutations

Compactions

Copying

Split-Reductions

Comparisons

Properties

Accessors

Array Creation

I/O

Function Objects

Accessors

Binary Operations

Statistical Functions

Examples#

Getting Started Examples #

Map Reduce#

Max Diff#

Sum Of Squares#

Real World Examples #

Aresdb Expand (2.1x code reduction)#

Fastllm Topk (5.7x code reduction)#

Paddle Paddle Mode (3.2x code reduction)#

Examples#

Getting Started Examples#

Map Reduce#

Max Diff#

Sum Of Squares#

Real World Examples#

Aresdb Expand (2.1x code reduction)#

Fastllm Topk (5.7x code reduction)#

Paddle Paddle Mode (3.2x code reduction)#

Getting Started Examples #

Real World Examples #