Thrust: GPU Parallel Algorithms
Thrust is to CUDA what the C++ STL is to CPU code. If you need parallel sort, reduce, scan, or transform on GPU data and don't want to write kernels, reach for Thrust first.
Setup
Thrust ships with CUDA toolkit (10.0+). Header-only, no separate install.
cpp
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>Host and Device Vectors
cpp
// Data lives on host
thrust::host_vector<float> h_vec(1000, 1.0f);
// Copy to GPU — single line
thrust::device_vector<float> d_vec = h_vec;
// Back to host
thrust::host_vector<float> result = d_vec;Sort
cpp
thrust::device_vector<int> d_keys = {5, 2, 8, 1, 9, 3};
thrust::sort(d_keys.begin(), d_keys.end());
// stable_sort, sort_by_key (co-sort values with keys)
thrust::device_vector<float> d_values = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());Reduce and Transform
cpp
// Sum
float total = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, thrust::plus<float>());
// Max element
float max_val = thrust::reduce(d_vec.begin(), d_vec.end(),
-FLT_MAX, thrust::maximum<float>());
// Transform: square each element in-place
thrust::transform(d_vec.begin(), d_vec.end(), d_vec.begin(),
[] __device__ (float x) { return x * x; });
// Transform two vectors element-wise
thrust::transform(d_a.begin(), d_a.end(), d_b.begin(), d_out.begin(),
thrust::plus<float>());Prefix Scan
cpp
// Exclusive scan: out[i] = sum(in[0..i-1])
thrust::exclusive_scan(d_vec.begin(), d_vec.end(), d_out.begin());
// Inclusive scan: out[i] = sum(in[0..i])
thrust::inclusive_scan(d_vec.begin(), d_vec.end(), d_out.begin());Counting and Fancy Iterators
cpp
// counting_iterator: generate 0,1,2,...N without allocating
thrust::counting_iterator<int> first(0);
thrust::counting_iterator<int> last = first + 1000;
// constant_iterator: broadcast a single value
thrust::constant_iterator<float> const_iter(3.14f);
// zip_iterator: iterate over two vectors in lockstep
auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin()));Custom Functors
cpp
struct AbsoluteDifference {
__host__ __device__
float operator()(float a, float b) const { return fabsf(a - b); }
};
thrust::transform(d_a.begin(), d_a.end(), d_b.begin(), d_out.begin(),
AbsoluteDifference());Interop with Raw CUDA
cpp
// Get raw pointer from device_vector for kernel calls
float* raw_ptr = thrust::raw_pointer_cast(d_vec.data());
my_kernel<<<grid, block>>>(raw_ptr, d_vec.size());When to Use Thrust vs Raw CUDA
- Thrust: sort, reduce, scan, filter (copy_if), unique, binary search on GPU data
- Raw CUDA: custom memory access patterns, tiling, warp-level primitives, anything Thrust doesn't have
Performance: Async Execution with Streams
Thrust calls synchronize by default. For asynchronous execution, use thrust::cuda::par.on(stream):
cpp
cudaStream_t stream;
cudaStreamCreate(&stream);
thrust::sort(thrust::cuda::par.on(stream), d_vec.begin(), d_vec.end());