Your First Kernel
Writing a CUDA kernel is the fundamental skill of GPU programming. Once you understand how to launch a kernel, how threads are indexed, and how to move data between CPU and GPU, you can write any parallel computation. This chapter walks through the complete process: from a "hello world" kernel to a full vector addition with memory management and error checking.
What Is a CUDA Kernel?
A kernel is a function that runs on the GPU, executed by thousands of threads in parallel. The __global__ keyword marks a function as callable from CPU code but running on the GPU. Each thread executes the same code but on different data -- this is the SIMT (Single Instruction, Multiple Threads) model.
// hello.cu
#include <stdio.h>
// Minimal kernel: every thread prints its identity.
// Demonstrates the built-in index variables; printf in device code is
// for debugging/teaching only (it serializes output).
__global__ void hello_kernel() {
    // Flat global id across the 1D grid.
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Hello from thread %d (block %d, thread %d)\n",
           global_id, blockIdx.x, threadIdx.x);
}
int main() {
    // 2 blocks x 4 threads each = 8 threads total.
    const int num_blocks = 2;
    const int threads_per_block = 4;
    hello_kernel<<<num_blocks, threads_per_block>>>();
    // Kernel launches are asynchronous: block here until the GPU
    // finishes so the device-side printf output appears before exit.
    cudaDeviceSynchronize();
    return 0;
}
# Compile and run
nvcc hello.cu -o hello
./hello
Thread Indexing
Every thread knows its position in the grid:
// 1D indexing (most common)
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// 2D indexing (useful for matrices)
int row = threadIdx.y + blockIdx.y * blockDim.y;
int col = threadIdx.x + blockIdx.x * blockDim.x;
int idx = row * width + col;
// Built-in variables:
// threadIdx.x/y/z -- thread position within block
// blockIdx.x/y/z -- block position within grid
// blockDim.x/y/z -- number of threads per block
// gridDim.x/y/z -- number of blocks in grid
Vector Addition
The "hello world" of GPU programming:
<Code id="code-first-kernel-3" caption="The &quot;hello world&quot; of GPU programming.">
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Expects a 1D launch covering at least n threads; extra threads exit
// via the bounds check. Marking the inputs const and all pointers
// __restrict__ tells the compiler they are read-only / non-aliasing,
// which enables read-only-cache loads. Callers passing plain float*
// are unaffected (float* converts implicitly to const float*).
__global__ void vec_add(const float* __restrict__ a,
                        const float* __restrict__ b,
                        float* __restrict__ c, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < n) { // Bounds check (grid may be larger than data)
        c[tid] = a[tid] + b[tid];
    }
}
int main() {
    int n = 1 << 20;                 // 1M elements
    size_t bytes = n * sizeof(float);

    // Allocate host memory
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (!h_a || !h_b || !h_c) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Initialize: result should be 1.0 + 2.0 = 3.0 everywhere
    for (int i = 0; i < n; i++) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    // Copy host -> device
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    // Launch kernel
    int block_size = 256;
    int grid_size = (n + block_size - 1) / block_size; // Ceiling division
    vec_add<<<grid_size, block_size>>>(d_a, d_b, d_c, n);

    // A launch returns immediately and reports config errors only via
    // cudaGetLastError() -- check it after every launch.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy device -> host (cudaMemcpy blocks until the kernel finishes)
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    // Verify the result on the CPU
    for (int i = 0; i < n; i++) {
        if (h_c[i] != 3.0f) {
            fprintf(stderr, "Mismatch at %d: %f\n", i, h_c[i]);
            return 1;
        }
    }

    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(h_a); free(h_b); free(h_c);
    return 0;
}
| Qualifier | Runs On | Callable From | Notes |
|---|---|---|---|
| __global__ | GPU | CPU (or GPU with dynamic parallelism) | Kernel entry point; must return void |
| __device__ | GPU | GPU only | Helper function called from kernels |
| __host__ | CPU | CPU only | Normal CPU function (default) |
| __host__ __device__ | Both | Both | Compiled for both architectures |
Total threads launched = grid.x * grid.y * grid.z * block.x * block.y * block.z.
| Parameter | Maximum Value | Notes |
|---|---|---|
| Threads per block | 1024 | block.x * block.y * block.z <= 1024 |
| Block dimension x | 1024 | |
| Block dimension y/z | 1024 / 64 | |
| Grid dimension x | 2^31 - 1 (about 2.1 billion blocks) | |
| Grid dimension y/z | 65535 | |
Exceeding these limits causes a silent launch failure -- the kernel does not execute and cudaGetLastError() returns cudaErrorInvalidConfiguration. This is one of the most common beginner mistakes: using a block size of 2048 or a 2D block of (64, 64) (which is 4096 threads). Always check cudaGetLastError() after every kernel launch.
Common Patterns
// Pattern 1: stride loop (handle more data than threads)
// When data size >> grid size, each thread processes multiple elements
// Grid-stride loop: each thread processes elements first, first+step,
// first+2*step, ... so any launch configuration covers any n.
__global__ void stride_kernel(float* data, int n) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x; // total threads in the grid
    for (int i = first; i < n; i += step)
        data[i] *= 2.0f;
}
// Launch with a fixed grid size (e.g., 256 blocks * 256 threads = 65K threads)
// Each thread processes n/65K elements -- works for any n
// Pattern 2: 2D grid for matrix operations
dim3 block(16, 16); // 256 threads per block arranged in 16x16
dim3 grid((width + 15) / 16, (height + 15) / 16);
matrix_kernel<<<grid, block>>>(data, width, height);
// Pattern 3: Conditional execution based on thread position
// Applies process() to each element of a rows x cols row-major matrix.
// Expects a 2D launch; threads outside the matrix exit immediately.
__global__ void boundary_kernel(float* data, int rows, int cols) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= rows || c >= cols) return; // 2D bounds check
    const int offset = r * cols + c;
    data[offset] = process(data[offset]);
}
Error Checking
// Always check CUDA calls for errors
// Wraps a CUDA API call; on failure prints file/line context and exits.
// The argument is parenthesized so expressions with commas or operators
// expand correctly, and the local is named err_ so it cannot shadow an
// `err` variable in the caller's scope. do/while(0) makes the macro a
// single statement, safe inside unbraced if/else.
#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, cudaGetErrorString(err_)); \
        exit(1); \
    } \
} while(0)
// Usage
CUDA_CHECK(cudaMalloc(&d_a, bytes));
CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
my_kernel<<<grid, block>>>(args); // This returns immediately
cudaError_t err = cudaGetLastError(); // Check for launch configuration errors
if (err != cudaSuccess) { /* handle */ }
cudaDeviceSynchronize(); // Wait for kernel to finish
err = cudaGetLastError(); // Check for runtime errors (out-of-bounds, etc.)
if (err != cudaSuccess) { /* handle */ }
In production code, use the CUDA_CHECK macro on every CUDA API call.
Memory Management Patterns
// Pattern 1: Manual allocation and deallocation
float *d_ptr;
CUDA_CHECK(cudaMalloc(&d_ptr, n * sizeof(float)));
// ... use d_ptr ...
CUDA_CHECK(cudaFree(d_ptr));
// Pattern 2: Unified memory (automatic CPU-GPU migration)
float *u_ptr;
CUDA_CHECK(cudaMallocManaged(&u_ptr, n * sizeof(float)));
// Can be accessed from BOTH CPU and GPU
// Data migrates on demand (page faults)
for (int i = 0; i < n; i++) u_ptr[i] = 1.0f; // CPU access
my_kernel<<<grid, block>>>(u_ptr); // GPU access (data migrates)
CUDA_CHECK(cudaFree(u_ptr));
// Pattern 3: Pinned host memory (faster CPU-GPU transfers)
float *h_pinned;
CUDA_CHECK(cudaMallocHost(&h_pinned, n * sizeof(float)));
// cudaMemcpyAsync works with pinned memory
CUDA_CHECK(cudaMemcpyAsync(d_ptr, h_pinned, n * sizeof(float),
cudaMemcpyHostToDevice, stream));
CUDA_CHECK(cudaFreeHost(h_pinned));
| Function | Location | Access From | Transfer | Use Case |
|---|---|---|---|---|
| cudaMalloc | GPU | GPU only | Explicit (cudaMemcpy) | Standard GPU allocation |
| cudaMallocHost | CPU (pinned) | CPU | Async-capable | Fast CPU-GPU transfers |
| cudaMallocManaged | Unified | CPU + GPU | Automatic (page fault) | Prototyping, sparse access |