I have been spending an entire day on trying to step into cuda kernel code for debug purpose in vscode with Nsight Visual Studio Code Edition extension. There are a few related questions in stack overflow, but they were asked about 9 years ago. I followed its official tutorial on youtube and its documentation, there is just no way for me to step in cuda kernel code in vscode.
Here is my kernel_debug.cu
#include <cuda_runtime.h>
// CUDA kernel for adding two matrices using a 3D block and thread configuration
__global__ void matrixAdd3D(const int* A, const int* B, int* C, int N) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int depth = blockIdx.z * blockDim.z + threadIdx.z; // Not used in computation, just for demonstration
if (row < N && col < N && depth == 0) { // Using depth == 0 to ensure it computes only once
C[row * N + col] = A[row * N + col] + B[row * N + col];
}
}
// Check for CUDA errors
void checkCudaError(cudaError_t err, const char* msg) {
if (err != cudaSuccess) {
std::cerr << msg << ": " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
int main() {
const int N = 1024; // Size of the matrix (1024x1024)
const int size = N * N * sizeof(int);
int *h_A, *h_B, *h_C;
// Allocate memory on the host
h_A = (int*)malloc(size);
h_B = (int*)malloc(size);
h_C = (int*)malloc(size);
// Initialize matrices
for (int i = 0; i < N * N; ++i) {
h_A[i] = 1;
h_B[i] = 2;
}
int *d_A, *d_B, *d_C;
// Allocate memory on the device
checkCudaError(cudaMalloc((void**)&d_A, size), "Failed to allocate device memory for A");
checkCudaError(cudaMalloc((void**)&d_B, size), "Failed to allocate device memory for B");
checkCudaError(cudaMalloc((void**)&d_C, size), "Failed to allocate device memory for C");
// Copy data from host to device
checkCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "Failed to copy A to device");
checkCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "Failed to copy B to device");
// Set up execution configuration with a 3D block and thread structure
dim3 threadsPerBlock(16, 16, 1); // 1 in the z-dimension since we're not using it for actual computation
dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
(N + threadsPerBlock.y - 1) / threadsPerBlock.y,
1); // Only one block in the z-dimension
// Launch the kernel
matrixAdd3D<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
checkCudaError(cudaGetLastError(), "Kernel launch failed");
// Copy result back to host
checkCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "Failed to copy C back to host");
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Check results and free host memory
for (int i = 0; i < N * N; i++) {
if (h_C[i] != 3) {
std::cerr << "Incorrect result!" << std::endl;
break;
}
}
std::cout << "Program completed successfully!" << std::endl;
free(h_A);
free(h_B);
free(h_C);
return 0;
}
and my CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
# Project name and version
project(kernel_debug VERSION 1.0)
# Enable C++17 standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G")
# Set build type to Debug by default
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Debug)
endif()
# Find CUDA package
find_package(CUDA REQUIRED)
# Add the source files
cuda_add_executable(kernel_debug kernel_debug.cu)
# Set CUDA properties for the target
set_target_properties(kernel_debug PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
Also in the vscode, at bottom right corner, it seems asking me to set the cuda coordinates to debug the kernel, basically it means debug will stop at the specific thread and block,
but I have tried for a whole day, no progess has been made.
If anyone ever got this debugging work, please help!!!