Examples#
This page provides standalone Parrot examples demonstrating common operations and patterns, as well as real-world examples showing before-and-after code transformations.
Getting Started Examples#
These examples demonstrate basic Parrot functionality and common patterns.
Map Reduce#
#include "parrot.hpp"
int main() {
  // Map-reduce: sum of the first 10k odd numbers.
  // N is kept small enough that the exact total, N * N, fits in a 32-bit int.
  // With N = 100000 the exact sum is 10,000,000,000, which does not fit; the
  // value previously documented for that size, 1,410,065,408, is exactly
  // 10^10 mod 2^32, i.e. a wrapped result rather than the real sum.
  int N = 10000;
  auto sum = parrot::range(N)  // presumably 1, 2, ..., N -- TODO confirm
                 .times(2)     // 2, 4, ..., 2N
                 .minus(1)     // 1, 3, ..., 2N - 1
                 .sum()
                 .print();  // 100,000,000
}
Max Diff#
#include "parrot.hpp"
int main() {
  // Largest absolute difference between neighbouring elements of the input.
  auto values = parrot::array({10, 5, 8, 3, 12, 1});
  auto largest_jump = values  //
                          .deltas()
                          .abs()
                          .maxr()
                          .print();  // 11 (|1 - 12|)
}
Sum Of Squares#
#include "parrot.hpp"
int main() {
  // Sum the squares of one million values.
  int count = 1000000;
  auto sum_sq = parrot::scalar(10)  //
                    .repeat(count)
                    .rand()  // presumably random values below 10 -- TODO confirm
                    .sq()
                    .sum()
                    .print();  // ~ 28.5M
}
Real World Examples#
These examples show real-world code transformations using Parrot. Each example presents the code before and after the transformation, highlighting how Parrot simplifies complex operations.
AresDB Expand (2.1x code reduction)#
Source: Original Code
Before (Original)
// Expands input dimension keys into the output dimension vector.
// Per-element counts come from IndexCountIterator(baseCounts, indexVector).
// The number of expanded elements written is capped by the free space left in
// outputKeys (VectorCapacity - outputOccupiedLen).
// All Thrust calls run under GET_EXECUTION_POLICY(cudaStream).
// Returns outputLen + outputOccupiedLen.
int expand(DimensionVector inputKeys, DimensionVector outputKeys,
uint32_t *baseCounts, uint32_t *indexVector, int indexVectorLen,
int outputOccupiedLen, cudaStream_t cudaStream) {
// create count iterator from baseCount and indexVector
IndexCountIterator countIter = IndexCountIterator(baseCounts, indexVector);
// total item counts by adding counts together
uint32_t totalCount = thrust::reduce(GET_EXECUTION_POLICY(cudaStream),
countIter, countIter + indexVectorLen);
// scan the counts to obtain output offsets for each input element
ares::device_vector<uint32_t> offsets(indexVectorLen);
thrust::exclusive_scan(GET_EXECUTION_POLICY(cudaStream), countIter,
countIter + indexVectorLen, offsets.begin());
// scatter the nonzero counts into their corresponding output positions
ares::device_vector<uint32_t> indices(totalCount);
thrust::scatter_if(GET_EXECUTION_POLICY(cudaStream),
thrust::counting_iterator<uint32_t>(0),
thrust::counting_iterator<uint32_t>(indexVectorLen),
offsets.begin(), countIter, indices.begin());
// compute max-scan over the indices, filling in the holes
thrust::inclusive_scan(GET_EXECUTION_POLICY(cudaStream), indices.begin(),
indices.end(), indices.begin(),
thrust::maximum<uint32_t>());
// get the raw pointer from device/host vector
uint32_t *newIndexVector = thrust::raw_pointer_cast(&indices[0]);
// never write more elements than the remaining capacity of outputKeys
int outputLen =
min(totalCount, outputKeys.VectorCapacity - outputOccupiedLen);
// start the real copy operation
DimensionColumnPermutateIterator iterIn(inputKeys.DimValues, newIndexVector,
inputKeys.VectorCapacity, outputLen,
inputKeys.NumDimsPerDimWidth);
DimensionColumnOutputIterator iterOut(
outputKeys.DimValues, outputKeys.VectorCapacity, outputLen,
inputKeys.NumDimsPerDimWidth, outputOccupiedLen);
// total number of dimensions across all dimension widths
int numDims = 0;
for (int i = 0; i < NUM_DIM_WIDTH; i++) {
numDims += inputKeys.NumDimsPerDimWidth[i];
}
// copy dim values into output
// NOTE(review): the factor 2 presumably accounts for two entries per
// dimension (e.g. value plus validity) -- confirm against the iterators.
thrust::copy(GET_EXECUTION_POLICY(cudaStream), iterIn,
iterIn + numDims * 2 * outputLen, iterOut);
// return total count in the output dimensionVector
return outputLen + outputOccupiedLen;
}
After (Parrot)
// Simplified Parrot port of expand: every dimension width is assumed to be 1.
// Builds one replication count per input key from base_counts and indices
// (presumably capped via capacity -- TODO confirm min() semantics), then
// replicates input_keys by those counts.
template <class BaseCountsArray,
          class IndexArray,
          class InputKeysArray>
auto expand_parrot(const InputKeysArray &input_keys,
                   const BaseCountsArray &base_counts,
                   const IndexArray &indices,
                   int capacity) {
  auto repeats = base_counts.deltas()
                     .gather(indices)
                     .sums()
                     .min(capacity)
                     .prepend(0)
                     .deltas()
                     .cycle({input_keys.size()});
  return input_keys.replicate(repeats);
}
Fastllm Topk (5.7x code reduction)#
Source: Original Code
Before (Original)
// Processes row i of cudaInput: sorts the row's values in descending order
// together with their column indices, then writes the first topk
// (index, value) pairs to cudaOutput, interleaved as [index, value, ...]
// starting at offset i * topk * 2.
// NOTE(review): cudaInput, cudaOutput, channels and topk are not declared in
// this excerpt -- presumably members of the enclosing functor.
__host__ __device__ void operator()(int i) const {
thrust::device_ptr<float> d_input(cudaInput);
thrust::device_ptr<float> d_output(cudaOutput);
// Start position of the current row
thrust::device_ptr<float> row_start = d_input + i * channels;
thrust::device_ptr<float> row_end = row_start + channels;
// Create the index sequence [0, 1, 2, ..., channels-1]
thrust::device_vector<int> indices(channels);
thrust::sequence(indices.begin(), indices.end());
// Use zip iterators to pair each value with its index
auto begin =
thrust::make_zip_iterator(thrust::make_tuple(row_start, indices.begin()));
auto end =
thrust::make_zip_iterator(thrust::make_tuple(row_end, indices.end()));
// Sort in descending order by value (this reorders the input row in place)
thrust::sort(begin, end, thrust::greater<thrust::tuple<float, int>>());
// Copy the first topk results to the output
for (int k = 0; k < topk; ++k) {
d_output[i * topk * 2 + k * 2] = indices[k]; // index
d_output[i * topk * 2 + k * 2 + 1] = row_start[k]; // value
}
}
After (Parrot)
// Returns the first k entries of d_input after enumerate().sort().rev().
// NOTE(review): presumably the k largest values paired with their indices --
// confirm the ordering semantics of enumerate()/sort().
auto topk(auto d_input, int k) {
return d_input.enumerate().sort().rev().take(k);
}
PaddlePaddle Mode (3.2x code reduction)#
Source: Original Code
Before (Original)
// For each of the num_rows rows of input_tensor, finds the most frequent value
// (the mode) and an associated index, writing them to out_tensor[i] and
// indices_tensor[i].
// NOTE(review): T is not declared in this excerpt -- presumably a template
// parameter of the enclosing scope.
// NOTE(review): the `end - 1` range below assumes num_cols >= 1.
static void GetModebySort(const phi::GPUContext &dev_ctx,
const DenseTensor *input_tensor,
const int64_t num_cols, const int64_t num_rows,
T *out_tensor, int64_t *indices_tensor) {
// Work on a (num_rows, num_cols) copy so sorting does not modify input_tensor.
DenseTensor input_tmp;
input_tmp.Resize(common::make_ddim({num_rows, num_cols}));
T *input_tmp_data = dev_ctx.Alloc<T>(&input_tmp);
phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp);
thrust::device_ptr<T> out_tensor_ptr(out_tensor);
thrust::device_ptr<int64_t> indices_tensor_ptr(indices_tensor);
for (int64_t i = 0; i < num_rows; ++i) {
// [begin, end) is row i of the copy.
T *begin = input_tmp_data + num_cols * i;
T *end = input_tmp_data + num_cols * (i + 1);
// indices_data starts as 0, 1, ..., num_cols - 1 and is permuted alongside
// the row's values by the sort below.
thrust::device_vector<int64_t> indices_data(num_cols);
thrust::sequence(thrust::device, indices_data.begin(),
indices_data.begin() + num_cols);
thrust::sort_by_key(thrust::device, begin, end, indices_data.begin());
// Number of distinct values: 1 + count of adjacent pairs that differ in the
// sorted row.
int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1,
begin + 1, 0, thrust::plus<int>(),
thrust::not_equal_to<T>());
// Collapse each run of equal values into (value, run length).
thrust::device_vector<T> keys_data(unique);
thrust::device_vector<int64_t> cnts_data(unique);
thrust::reduce_by_key(thrust::device, begin, end,
thrust::constant_iterator<int>(1), keys_data.begin(),
cnts_data.begin());
// The value with the largest run length is the mode.
auto it = thrust::max_element(thrust::device, cnts_data.begin(),
cnts_data.begin() + unique);
T mode = keys_data[it - cnts_data.begin()];
int64_t counts = cnts_data[it - cnts_data.begin()];
// pos is the first occurrence of mode in the sorted row; pos + counts - 1 is
// the last element of its run, whose stored index is reported.
auto pos = thrust::find(thrust::device, begin, end, mode);
int64_t index = indices_data[pos - begin + counts - 1];
out_tensor_ptr[i] = static_cast<T>(mode);
indices_tensor_ptr[i] = static_cast<int64_t>(index);
}
}
After (Parrot)
template <typename Array>
auto GetModeBySort_Parrot(const Array& data, int num_rows, int num_cols) {
using T = typename Array::value_type;
std::vector<thrust::pair<T, int>> results;
for (int r = 0; r < num_rows; ++r) {
auto mode = parrot::stats::mode(data.row(r)).value();
auto index = data.row(r).last_index_of(mode);
results.push_back(thrust::make_pair(mode, index));
}
return parrot::array(results);
}