- Published on
HelloCuda 系列 Dynamic Parallelism
Dynamic parallelism is a powerful CUDA feature that allows a kernel running on the GPU to launch other kernels directly from device code. Because new work can be launched without returning to the host, dynamic parallelism can help manage load balancing and efficiently utilize GPU resources without excessive synchronization with the CPU.
- Basic CUDA program
dp.cu
#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>
/**
 * Child kernel launched from device code.
 *
 * Each thread prints the thread id passed in by the launcher together with
 * its own threadIdx.x, then increments data[threadId] if its threadIdx.x is
 * below 128.
 *
 * @param threadId index into `data` chosen by the launcher; it is NOT
 *                 bounds-checked here, so the caller must pass a valid index
 * @param data     device pointer to the int array being updated
 */
__global__ void childKernel(int threadId, int *data) {
    const int lane = threadIdx.x;

    // Device-side printf: intended for debugging/demo output only.
    printf("Child kernel: Thread %d, Child Index %d\n", threadId, lane);

    // Threads with an index of 128 or more do not touch the data.
    if (lane >= 128) {
        return;
    }
    data[threadId] += 1;
}
/**
 * Parent kernel: doubles one element of `data` per thread and then launches
 * a single-thread child grid (childKernel) for that same element.
 *
 * Expected launch layout: 1D grid of 1D blocks; the flat global thread index
 * selects the element. Threads whose index is >= size do nothing.
 * Launching a kernel from device code requires building with -rdc=true.
 *
 * @param data device pointer; must hold at least `size` ints (precondition,
 *             not checked here)
 * @param size number of valid elements in `data`
 */
__global__ void parentkernel(int *data, int size) {
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    if (threadId < size) {
        data[threadId] = 2 * data[threadId];

        // Launch the child only for in-range threads: childKernel writes
        // data[threadId] without knowing `size`, so launching it from an
        // out-of-range thread would be an out-of-bounds write.
        // Global-memory writes made by this thread before the launch are
        // visible to the child grid.
        childKernel<<<1, 1>>>(threadId, data);
    }
    // No explicit wait for the child is needed (or possible via
    // __syncthreads(), which is only a block-level barrier): the parent grid
    // is not considered complete until all of its child grids have finished,
    // so the host observes the children's results after synchronizing on
    // this kernel.
}
/**
 * Host entry point: fills a 1024-element int array with 0..1023, copies it
 * to the device, runs parentkernel (which launches childKernel from device
 * code), copies the result back and prints every element.
 *
 * @return 0 on success, -1 if any CUDA call or the kernel fails.
 */
int main() {
    const int dataSize = 1024;
    int *d_data = nullptr;

    // `err` must be declared before use; it carries the status of each CUDA call.
    cudaError_t err = cudaMalloc((void**)&d_data, dataSize * sizeof(int));
    if (err != cudaSuccess) {
        std::cerr << "Error allocating device memory: " << cudaGetErrorString(err) << std::endl;
        return -1;
    }

    int h_data[dataSize];
    for (int i = 0; i < dataSize; ++i) {
        h_data[i] = i;
    }

    err = cudaMemcpy(d_data, h_data, dataSize * sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "Error copying data to device: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    // One block of 128 threads: only elements 0..127 are processed by the
    // kernels; elements 128..1023 keep their initial values.
    parentkernel<<<1, 128>>>(d_data, dataSize);

    // Launch-configuration errors are reported here ...
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Error launching parent kernel: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    // ... while errors raised during execution of the parent or its child
    // grids only surface at a synchronizing call. Check explicitly so they
    // are not misreported as a copy failure below.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "Error executing parent kernel: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    err = cudaMemcpy(h_data, d_data, dataSize * sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        std::cerr << "Error copying data back to host: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    for (int i = 0; i < dataSize; ++i) {
        std::cout << "h_data[" << i << "] = " << h_data[i] << std::endl;
    }

    cudaFree(d_data);
    return 0;
}
- build
nvcc -Wno-deprecated-gpu-targets -rdc=true dp.cu -o dp && ./dp
Caution: `-rdc=true` is required to enable dynamic parallelism in CUDA. This flag allows the compiler to generate relocatable device code (RDC), which is necessary for launching kernels from other kernels.
在 CUDA 中,Relocatable Device Code (RDC) 是一种编译模式,允许将设备端代码(如 `__device__` 或 `__global__` 函数)编译为可重定位的中间对象文件(类似 CPU 端的 .o 文件),而不是直接生成最终的二进制代码。
THE END