CUB  
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
example_block_scan.cu
/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* Simple demonstration of cub::BlockScan
*
* To compile using the command line:
* nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
*
******************************************************************************/
// Ensure printing of CUDA runtime errors to console (define before including cub.h)
#define CUB_STDERR
#include <stdio.h>
#include <iostream>
#include "../../test/test_util.h"
using namespace cub;
//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------
bool g_verbose = false;
int g_timing_iterations = 100;
int g_grid_size = 1;
//---------------------------------------------------------------------
// Kernels
//---------------------------------------------------------------------
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockScanAlgorithm ALGORITHM>
__global__ void BlockPrefixSumKernel(
int *d_in, // Tile of input
int *d_out, // Tile of output
clock_t *d_elapsed) // Elapsed cycle count of block scan
{
// Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
// Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
// Specialize BlockScan type for our thread block
// Shared memory
__shared__ union TempStorage
{
typename BlockLoadT::TempStorage load;
typename BlockStoreT::TempStorage store;
typename BlockScanT::TempStorage scan;
} temp_storage;
// Per-thread tile data
int data[ITEMS_PER_THREAD];
// Load items into a blocked arrangement
BlockLoadT(temp_storage.load).Load(d_in, data);
// Barrier for smem reuse
__syncthreads();
// Start cycle timer
clock_t start = clock();
// Compute exclusive prefix sum
int aggregate;
BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);
// Stop cycle timer
clock_t stop = clock();
// Barrier for smem reuse
__syncthreads();
// Store items from a blocked arrangement
BlockStoreT(temp_storage.store).Store(d_out, data);
// Store aggregate and elapsed clocks
if (threadIdx.x == 0)
{
*d_elapsed = (start > stop) ? start - stop : stop - start;
d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
}
}
//---------------------------------------------------------------------
// Host utilities
//---------------------------------------------------------------------
int Initialize(
int *h_in,
int *h_reference,
int num_items)
{
int inclusive = 0;
for (int i = 0; i < num_items; ++i)
{
h_in[i] = i % 17;
h_reference[i] = inclusive;
inclusive += h_in[i];
}
return inclusive;
}
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
BlockScanAlgorithm ALGORITHM>
void Test()
{
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Allocate host arrays
int *h_in = new int[TILE_SIZE];
int *h_reference = new int[TILE_SIZE];
int *h_gpu = new int[TILE_SIZE + 1];
// Initialize problem and reference output on host
int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
// Initialize device arrays
int *d_in = NULL;
int *d_out = NULL;
clock_t *d_elapsed = NULL;
cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE);
cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1));
cudaMalloc((void**)&d_elapsed, sizeof(clock_t));
// Display input problem data
if (g_verbose)
{
printf("Input data: ");
for (int i = 0; i < TILE_SIZE; i++)
printf("%d, ", h_in[i]);
printf("\n\n");
}
// Kernel props
int max_sm_occupancy;
CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
(ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
// Run aggregate/prefix kernel
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
// Check results
printf("\tOutput items: ");
int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Check total aggregate
printf("\tAggregate: ");
compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Run this several times and average the performance results
GpuTimer timer;
float elapsed_millis = 0.0;
clock_t elapsed_clocks = 0;
for (int i = 0; i < g_timing_iterations; ++i)
{
// Copy problem to device
cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
timer.Start();
// Run aggregate/prefix kernel
BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
d_in,
d_out,
d_elapsed);
timer.Stop();
elapsed_millis += timer.ElapsedMillis();
// Copy clocks from device
clock_t clocks;
CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
elapsed_clocks += clocks;
}
// Check for kernel errors and STDIO from the kernel, if any
CubDebugExit(cudaPeekAtLastError());
CubDebugExit(cudaDeviceSynchronize());
// Display timing results
float avg_millis = elapsed_millis / g_timing_iterations;
float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
float avg_clocks = float(elapsed_clocks) / g_timing_iterations;
float avg_clocks_per_item = avg_clocks / TILE_SIZE;
printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
printf("\tAverage kernel millis: %.4f\n", avg_millis);
printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
// Cleanup
if (h_in) delete[] h_in;
if (h_reference) delete[] h_reference;
if (h_gpu) delete[] h_gpu;
if (d_in) cudaFree(d_in);
if (d_out) cudaFree(d_out);
if (d_elapsed) cudaFree(d_elapsed);
}
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
args.GetCmdLineArgument("grid-size", g_grid_size);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s "
"[--device=<device-id>] "
"[--i=<timing iterations (default:%d)>]"
"[--grid-size=<grid size (default:%d)>]"
"[--v] "
"\n", argv[0], g_timing_iterations, g_grid_size);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Run tests
Test<1024, 1, BLOCK_SCAN_RAKING>();
Test<512, 2, BLOCK_SCAN_RAKING>();
Test<256, 4, BLOCK_SCAN_RAKING>();
Test<128, 8, BLOCK_SCAN_RAKING>();
Test<64, 16, BLOCK_SCAN_RAKING>();
Test<32, 32, BLOCK_SCAN_RAKING>();
printf("-------------\n");
Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>();
Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>();
Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>();
Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>();
Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>();
Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>();
printf("-------------\n");
Test<1024, 1, BLOCK_SCAN_WARP_SCANS>();
Test<512, 2, BLOCK_SCAN_WARP_SCANS>();
Test<256, 4, BLOCK_SCAN_WARP_SCANS>();
Test<128, 8, BLOCK_SCAN_WARP_SCANS>();
Test<64, 16, BLOCK_SCAN_WARP_SCANS>();
Test<32, 32, BLOCK_SCAN_WARP_SCANS>();
return 0;
}