I have a total of 811 images, each an RGB image of dimension 1440×1080, so the raw data is about 3.52 GB in total. I'm using CUDA v12.2, an NVIDIA RTX 3090 GPU, and Visual Studio 2019. I'm facing two issues
-
After
readImage()
function, the process memory shows 12.5 GB. Why are the 3.52 GB of images taking this much memory? How can I decrease it?
// Global container: one preprocessed CV_64F grayscale frame per input image.
// Memory note: each 1440x1080 frame stored as CV_64F is ~12.4 MB on the device
// (8 bytes/pixel), so 811 frames alone are ~10 GB. Using CV_32F would halve
// that; keep CV_64F only if downstream math truly needs double precision.
std::vector<cv::cuda::GpuMat> gpuImgStack;

// Reads IMG_SIZE BMP images named "a1_<i>.BMP" from img_path, uploads each to
// the GPU, applies a brightness/contrast stretch (x - 155, then * 3.6, both
// saturating on CV_8UC3), converts BGR -> grayscale, converts to CV_64F, and
// appends the result to the global gpuImgStack.
// Exits the process with -1 if any image fails to load.
void readImage(std::string img_path)
{
    gpuImgStack.reserve(IMG_SIZE); // avoid repeated reallocation of the header vector

    cv::Mat original_img;
    cv::cuda::GpuMat gpuColor, gpuGray;

    for (int i = 0; i < IMG_SIZE; i++) {
        original_img = cv::imread(img_path + "a1_" + std::to_string(i + 1) + ".BMP");

        // Check BEFORE uploading: GpuMat::upload on an empty Mat throws,
        // so the original post-upload empty() check could never be reached.
        if (original_img.empty()) {
            printf("Image read failed\n"); // was "failedn" — lost escape
            exit(-1);
        }

        gpuColor.upload(original_img);

        // Saturating stretch on 8-bit data (values clamp to [0, 255]).
        gpuColor.convertTo(gpuColor, -1, 1, -155);
        gpuColor.convertTo(gpuColor, -1, 3.6, 0);

        // Out-of-place on purpose: cuda::cvtColor with a channel-count change
        // (3 -> 1) must not alias src and dst — dst.create() would reallocate
        // the buffer src still points at.
        cv::cuda::cvtColor(gpuColor, gpuGray, cv::COLOR_BGR2GRAY);

        // Same reason for the type change (8U -> 64F): convert into a fresh mat.
        cv::cuda::GpuMat gpuDouble;
        gpuGray.convertTo(gpuDouble, CV_64F);

        // GpuMat is reference-counted: push_back shares the buffer, no deep copy.
        gpuImgStack.push_back(gpuDouble);
    }
    std::cout << "Image UpLoading Done!" << std::endl;
}
After Uploading images
-
After running the
SML()
function, the process memory reaches 37.8 GB. Why is it using this much memory, and how can I decrease it? (Process memory after the CUDA kernel)
// 3x3 modified-Laplacian operators in constant memory (all lanes read the same
// address -> broadcast). Indexed as [rowOffset + 1][colOffset + 1].
// Horizontal second difference: 2f(x,y) - f(x-1,y) - f(x+1,y)
__constant__ double kernel_h[3][3] = { {  0.0,  0.0,  0.0 },
                                       { -1.0,  2.0, -1.0 },
                                       {  0.0,  0.0,  0.0 } };
// Vertical second difference: 2f(x,y) - f(x,y-1) - f(x,y+1)
__constant__ double kernel_v[3][3] = { {  0.0, -1.0,  0.0 },
                                       {  0.0,  2.0,  0.0 },
                                       {  0.0, -1.0,  0.0 } };

// Modified Laplacian: out(x,y) = min(255, |kernel_h * f| + |kernel_v * f|).
// Border pixels (no full 3x3 neighborhood) are written as 0 so the output
// buffer is fully defined even though it is never memset.
//
// inStep / outStep are the row pitches in ELEMENTS (doubles), not bytes.
// cv::cuda::GpuMat rows are pitch-padded, so indexing with imgWidth as the
// stride reads the wrong pixels on every row after the first — that was the
// main correctness bug in the original kernel.
__global__ void convolution_Kernel(const double* inputImg, double* convolutedImg,
                                   unsigned int imgWidth, unsigned int imgHeight,
                                   size_t inStep, size_t outStep)
{
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (row >= imgHeight || col >= imgWidth)
        return;

    // 1-pixel border: zero it explicitly instead of leaving garbage.
    if (row < 1 || col < 1 || row >= imgHeight - 1 || col >= imgWidth - 1) {
        convolutedImg[row * outStep + col] = 0.0;
        return;
    }

    double sumX = 0.0, sumY = 0.0;
    for (int dy = -1; dy <= 1; dy++) {
        for (int dx = -1; dx <= 1; dx++) {
            double v = inputImg[(row + dy) * inStep + (col + dx)];
            // [row][col] indexing matches the (dy, dx) offsets; the original
            // indexed the taps transposed, applying an off-center operator.
            sumX += v * kernel_h[dy + 1][dx + 1];
            sumY += v * kernel_v[dy + 1][dx + 1];
        }
    }

    // fabs: std::abs is not guaranteed to be usable in device code.
    double sum = fabs(sumX) + fabs(sumY);
    if (sum > 255.0) sum = 255.0; // sum >= 0 already, no lower clamp needed

    convolutedImg[row * outStep + col] = sum;
}

// Computes the modified Laplacian of every frame in gpuImgStack into ML3.
// Exits with -1 on any CUDA error.
void SML()
{
    height = gpuImgStack[0].rows;
    width  = gpuImgStack[0].cols;
    maxIndices = cv::cuda::GpuMat(height, width, CV_64F);

    // NOTE(memory): ML3 + SML3 together are 2 * IMG_SIZE full CV_64F frames,
    // ~12.4 MB each at 1440x1080 — roughly 20 GB of device memory for 811
    // images. That, plus ~10 GB for gpuImgStack, explains the ~37 GB figure.
    // Switching the whole pipeline to CV_32F halves it; allocating SML3 lazily
    // (only when the later stage runs) also reduces the peak.
    cv::cuda::GpuMat ML3[IMG_SIZE];
    cv::cuda::GpuMat SML3[IMG_SIZE];
    for (int i = 0; i < IMG_SIZE; i++) {
        ML3[i]  = cv::cuda::GpuMat(height, width, CV_64F);
        SML3[i] = cv::cuda::GpuMat(height, width, CV_64F);
    }

    dim3 block(16, 16); // 256 threads per block
    dim3 grid((width + block.x - 1) / block.x,   // ceil-div covers ragged edges
              (height + block.y - 1) / block.y);

    for (int i = 0; i < IMG_SIZE; i++) {
        convolution_Kernel<<<grid, block>>>(
            gpuImgStack[i].ptr<double>(), ML3[i].ptr<double>(),
            width, height,
            gpuImgStack[i].step / sizeof(double), // GpuMat::step is in BYTES
            ML3[i].step / sizeof(double));
    }

    // cudaGetLastError catches launch-config errors; the synchronize surfaces
    // asynchronous execution errors (e.g. illegal address) from the kernels.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("SML kernel failed: %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
I tried to optimize the kernel and this is my kernel right now.
Also, I tried not using these two containers, but that doesn't work: if the kernel's source and destination images alias the same buffer, the results come out wrong. cv::cuda::GpuMat ML3[IMG_SIZE]; cv::cuda::GpuMat SML3[IMG_SIZE];
6