HelloCuda 系列 Dynamic Parallelism

Dynamic parallelism is a powerful CUDA feature that allows a kernel running on the GPU to launch other kernels directly from device code. Because new work can be spawned where it is discovered, dynamic parallelism helps with load balancing and keeps GPU resources busy without excessive synchronization with the CPU.

Basic CUDA program dp.cu

#include <iostream>
#include <cuda_runtime.h>
#include <stdio.h>

/**
 * Child kernel: reports which parent index it serves and bumps that element.
 *
 * Every thread prints the `threadId` value it was given together with its own
 * threadIdx.x. Threads whose threadIdx.x is below 128 then add 1 to
 * data[threadId].
 *
 * @param threadId  Index into `data` supplied by the launcher. It is not
 *                  bounds-checked here, so the launcher must pass a valid index.
 * @param data      Device pointer to the int array being updated.
 */
__global__ void childKernel(int threadId, int *data) {
   const int lane = threadIdx.x;

   printf("Child kernel: Thread %d, Child Index %d\n", threadId, lane);

   // Threads at or beyond index 128 only print; they do not modify data.
   if (lane >= 128) {
      return;
   }

   data[threadId] += 1;
}

/**
 * Parent kernel: doubles one element per thread, then launches a child grid
 * for that element.
 *
 * Each thread derives a global index from threadIdx.x, blockIdx.x and
 * blockDim.x. A thread whose index is below `size` doubles data[index] and
 * launches childKernel<<<1, 1>>> with that same index. Threads whose index is
 * at or beyond `size` do nothing.
 *
 * Launching a kernel from device code requires building with -rdc=true.
 *
 * @param data  Device pointer to an int array; must hold at least `size` elements.
 * @param size  Number of valid elements in `data`.
 */
__global__ void parentkernel(int *data, int size) {
   int threadId = threadIdx.x + blockIdx.x * blockDim.x;

   // Out-of-range threads must not launch a child either: the child writes
   // data[threadId] without a bounds check of its own.
   if (threadId >= size) {
      return;
   }

   data[threadId] = 2 * data[threadId];

   // Launch the child kernel. Global-memory writes made by this thread before
   // the launch are visible to the child grid, so the child sees the doubled
   // value.
   childKernel<<<1, 1>>>(threadId, data);

   // No barrier is needed here. __syncthreads() only synchronizes the threads
   // of this block; it does not wait for child grids. The parent grid is not
   // considered complete until every child grid it launched has completed, so
   // the host observes the child's update once it synchronizes on the parent.
}

/**
 * Host driver for the dynamic-parallelism demo.
 *
 * Allocates a device array of dataSize ints, fills it with 0..dataSize-1,
 * runs parentkernel on it, copies the result back and prints every element.
 *
 * @return 0 on success, -1 if any CUDA call fails (the device buffer is
 *         released before returning).
 */
int main() {
    const int dataSize = 1024;
    const size_t dataBytes = dataSize * sizeof(int);

    int *d_data = nullptr;
    cudaError_t err = cudaMalloc((void**)&d_data, dataBytes);
    if (err != cudaSuccess) {
        std::cerr << "Error allocating device memory: " << cudaGetErrorString(err) << std::endl;
        return -1;
    }

    int h_data[dataSize];
    for (int i = 0; i < dataSize; ++i) {
        h_data[i] = i;
    }
    err = cudaMemcpy(d_data, h_data, dataBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "Error copying data to device: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    // One block of 128 threads: only the first 128 of the dataSize elements
    // are processed by the kernels; the remaining ones are printed unchanged.
    parentkernel<<<1, 128>>>(d_data, dataSize);

    // A launch reports configuration errors through cudaGetLastError() ...
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "Error launching parent kernel: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    // ... while failures during execution of the parent or its child grids
    // surface at the next synchronizing call. Synchronize explicitly so they
    // are not misreported as a copy error below.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "Error executing parent kernel: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }

    err = cudaMemcpy(h_data, d_data, dataBytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        std::cerr << "Error copying data back to host: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_data);
        return -1;
    }
    for (int i = 0; i < dataSize; ++i) {
        std::cout << "h_data[" << i << "] = " << h_data[i] << std::endl;
    }
    cudaFree(d_data);

    return 0;
}

build

nvcc -Wno-deprecated-gpu-targets -rdc=true dp.cu -o dp && ./dp

*Caution: -rdc=true is required to enable dynamic parallelism in CUDA. This flag tells the compiler to generate relocatable device code (RDC), which is necessary for launching kernels from other kernels.

在 CUDA 中，Relocatable Device Code (RDC) 是一种编译模式，允许将设备端代码（如 __device__ 或 __global__ 函数）编译为可重定位的中间对象文件（类似 CPU 端的 .o 文件），而不是直接生成最终的二进制代码。

THE END