CUDA image upsampling with FFT method

I’m trying to do image upsampling with FFT in CUDA. I first do forward FFT on the image, then I pad the result with 0 as shown below:

for a transformed image:

1 2

3 4

Pad it to:

1 0 0 2

0 0 0 0

3 0 0 4

Then I do inverse fft and normalize and output the image. However the output seems problematic:

The original input image:

The output image:

Below is my code:

using namespace cv;
using namespace std;

const int upsamplingFactor = 2;
const string inputName = "../input/4096x4096.jpg";

//convert uint8_t array to __half array
__global__
void Upsampling(
    cufftComplex *input,
    int inputWidth,
    int inputHeight,
    cufftComplex *output,
    int outputWidth,
    int outputHeight
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= outputWidth || y >= outputHeight) 
    {
        return;
    }
    int endXLeft = inputWidth / 2;
    int startXRight = outputWidth - inputWidth / 2;
    int endYTop = inputHeight / 2;
    int startYBottom = outputHeight - inputHeight / 2;

    if( x < endXLeft && y < endYTop)
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x ];
    } 
    else if( x >= startXRight && y < endYTop ) 
    {
        output[ y * outputWidth + x ] = input[ y * inputWidth + x - startXRight + endXLeft ];
    }
    else if( x < endXLeft && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x ];
    }
    else if( x >= startXRight && y >= startYBottom )
    {
        output[ y * outputWidth + x ] = input[ (y - startYBottom + endYTop) * inputWidth + x - startXRight + endXLeft ];
    }
    else {
        output[y * outputWidth + x] = make_cuComplex( 0.f, 0.f );
    }
}

//normalize and switch to uint8_t
__global__
void Normalize( 
    float *input, 
    uint8_t *output,
    int imageWidth,
    int imageHeight,
    int size
)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= imageWidth || y >= imageHeight) return;

    output[y * imageWidth + x] = static_cast<uint8_t>(input[y * imageWidth + x] / size + 0.5f);
}

int main(){

    //load input image
    cv::Mat image = cv::imread( inputName, cv::IMREAD_GRAYSCALE );
    if( image.empty() )
    {
        cerr<<"Failed to load image"<<std::endl;
        return EXIT_FAILURE;
    }
    image.convertTo(image,CV_32F);

    //set up params
    const int imageWidth        = image.cols;
    const int imageHeight       = image.rows;
    const size_t imageSize         = ( size_t ) imageWidth * imageHeight;
    const int upsamplingWidth = imageWidth * upsamplingFactor;
    const int upsamplingHeight = imageHeight * upsamplingFactor;
    const size_t upsamplingSize = upsamplingWidth * upsamplingHeight;

    //copy image and kernel to gpu
    float *d_image, *d_output;
    uint8_t *d_outputInt;
    cufftComplex *d_freqImage, *d_freqUpsampling;
    cufftHandle  forwardImagePlan, inverseImagePlan;
    cufftResult  result;

    cudaMalloc( &d_image, imageSize * sizeof(float) );
    cudaMalloc( &d_freqImage, imageSize * sizeof( cufftComplex ) );
    cudaMalloc( &d_freqUpsampling, upsamplingSize * sizeof( cufftComplex) );
    cudaMalloc( &d_output, upsamplingSize * sizeof(float) );
    cudaMalloc( &d_outputInt, upsamplingSize * sizeof(uint8_t) );
    cudaMemcpy( d_image, image.data, imageSize * sizeof(float), cudaMemcpyHostToDevice );
    
    dim3 convertBlockSize(32, 32);
    dim3 convertGridSize( ( upsamplingWidth + convertBlockSize.x - 1 ) / convertBlockSize.x,
                        ( upsamplingHeight + convertBlockSize.y - 1 ) / convertBlockSize.y );

    //creating 2d fft plans
    //invfft should have size after upsampling
    result = cufftPlan2d(&inverseImagePlan, upsamplingWidth, upsamplingHeight, CUFFT_C2R);
    if(result!=CUFFT_SUCCESS)
    {
        std::cerr << "Failed to create inverse FFT plan with code: " << result <<std::endl;
        return EXIT_FAILURE;
    }
    //forward fft should have the size of the original image
    result = cufftPlan2d(&forwardImagePlan, imageWidth, imageHeight, CUFFT_R2C);
    if(result!=CUFFT_SUCCESS){...}

    result = cufftExecR2C( forwardImagePlan, d_image, d_freqImage );
    if( result != CUFFT_SUCCESS ){...}

    Upsampling<<< convertGridSize, convertBlockSize >>>( d_freqImage, imageWidth, imageHeight, d_freqUpsampling, upsamplingWidth, upsamplingHeight );
    
    result = cufftExecC2R( inverseImagePlan, d_freqUpsampling, d_output );
    if( result != CUFFT_SUCCESS ){...}
    //normalize and switch output to uint8_t
    Normalize<<< convertGridSize, convertBlockSize >>>( d_output, d_outputInt, upsamplingWidth, upsamplingHeight, imageSize ); 

    cudaDeviceSynchronize();
    
    Mat output = Mat::zeros( upsamplingHeight, upsamplingWidth, CV_8U );
    cudaMemcpy( output.data, d_outputInt, upsamplingSize * sizeof( uint8_t ), cudaMemcpyDeviceToHost );

    //save output
    cv::imwrite( "fftOutput.png", output );

    //free memory
    cufftDestroy(forwardImagePlan);
    cufftDestroy(inverseImagePlan);
    cudaFree(d_image);
    cudaFree(d_freqImage);
    cudaFree(d_output);
    cudaFree(d_outputInt);
    cudaFree(d_freqUpsampling);
    return 0;
}

Is my padding strategy wrong? I think I implemented the global function correctly, maybe there’s some problem in the code elsewhere that I missed. Thank you in advance!!

I haven’t tried to find all the issues in your code. However, a R2C transform does not work the way you think (I suspect). In the 2D case, one of the image output dimensions is not the input dimension, but is the input dimension divided by two, plus one. You don’t seem to have accounted for that in your d_freqImage allocation, which by itself is not a showstopper, but if you are then expecting your complex allocation to be completely filled with complex data via the forward FFT, you would be mistaken.

Speaking for myself, although of course it is possible to use R2C with sufficient effort, I would not try to use R2C and C2R transforms with this method to get a first article working/for understanding. The R2C and C2R transforms expect and make use of hermitian symmetry (with all that that implies in terms of data formatting) in the complex domain. (See below for added section covering R2C/C2R case).

I followed the description here and came up with this, which seems to work using C2C transforms:

#include "lodepng.h"
#include <iostream>
#include <vector>
#include <cufft.h>

//Encode from raw pixels to disk with a single function call
//The image argument has width * height RGBA pixels or width * height * 4 bytes
void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
  //Encode the image
  unsigned error = lodepng::encode(filename, image, width, height);

  //if there's an error, display it
  if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
}

void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {

  //decode
  unsigned error = lodepng::decode(image, width, height, filename);

  //if there's an error, display it
  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;

  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
}

template <typename T>
__global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < w) && (idy < h)){
    int oidx = (idx >= (w/2))?(idx+w*(uf-1)):idx;
    int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
    int iid = w*idy+idx;
    int oid = uf*w*oidy+oidx;
    o[oid] = i[iid];}
}

int main(){
  unsigned uf = 3; // upscaling factor
  const char *ifn = "test_i.png";
  const char *ofn = "test_o.png";
  // load input image
  std::vector<unsigned char> i;
  unsigned w, h;
  decodeOneStep(i, w, h, ifn);
  std::cout << "width: " << w << " height: " << h << std::endl;
  // convert to grayscale float normalized 255 = 1.0f
  std::vector<float> in_f(w*h*2);
  for (int j = 0; j < w*h; j++) in_f[j*2] = (0.299*i[4*j] + 0.587*i[4*j+1] + 0.114*i[4*j+2])/256.0;
  // copy to device
  cufftComplex *d_i;
  cudaMalloc(&d_i, w*h*sizeof(cufftComplex));
  cudaMemcpy(d_i, in_f.data(), w*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
  // perform forward FFT
  cufftHandle hf;
  cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_C2C); 
  r = cufftExecC2C(hf, d_i, d_i, CUFFT_FORWARD);
  // perform data split to corners of upscale complex image
  cufftComplex *d_u; 
  cudaMalloc(&d_u, uf*uf*w*h*sizeof(cufftComplex));
  cudaMemset(d_u, 0, uf*uf*w*h*sizeof(cufftComplex));
  dim3 sblock(32,32);
  dim3 sgrid((w+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
  split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
  // perform inverse FFT
  cufftHandle hi;
  r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2C); 
  r = cufftExecC2C(hi, d_u, d_u, CUFFT_INVERSE);
  // copy to host
  std::vector<float2> of(w*h*uf*uf);
  cudaMemcpy(of.data(), d_u, w*h*uf*uf*sizeof(float2), cudaMemcpyDeviceToHost);
  // convert to ordinary unsigned char RGBA image (as grayscale image)
  std::vector<unsigned char> o(w*h*uf*uf*4);
  for (int i = 0; i < w*h*uf*uf; i++) {
    if (of[i].x > w*h) of[i].x = w*h;  // clamp
    if (of[i].x < 0) of[i].x = 0;      // clamp
    unsigned char pix = (of[i].x/(w*h))*255;  //scale to unsigned char
    o[4*i]   = pix;  //RGBA for PNG image
    o[4*i+1] = pix;
    o[4*i+2] = pix;
    o[4*i+3] = 255;}
  // save output image
  encodeOneStep(ofn, o, w*uf, h*uf);
}

The lodepng stuff can be gotten here. The original input image:

came from here. This is what the output image looks like (3x larger dimensions for my example):

Compile with nvcc -o test example.cu lodepng.cpp -lcufft

EDIT/LATER:
There was a comment/question posted (now deleted) asking effectively how to do this with R2C and C2R.

It seems that it is possible according to my testing. The basic difference is during the split operation, we will split only horizontally (split the complex data to top and bottom of upscaled complex image), not to all 4 corners. There are of course a number of other changes related to the mechanics of the R2C and C2R, and the special sizes for associated buffers.

Here’s an example that seems to produce a similar output:

#include "lodepng.h"
#include <iostream>
#include <vector>
#include <cufft.h>

//Encode from raw pixels to disk with a single function call
//The image argument has width * height RGBA pixels or width * height * 4 bytes
void encodeOneStep(const char* filename, std::vector<unsigned char>& image, unsigned width, unsigned height) {
  //Encode the image
  unsigned error = lodepng::encode(filename, image, width, height);

  //if there's an error, display it
  if(error) std::cout << "encoder error " << error << ": "<< lodepng_error_text(error) << std::endl;
}

void decodeOneStep(std::vector<unsigned char> &image, unsigned &width, unsigned &height, const char* filename) {

  //decode
  unsigned error = lodepng::decode(image, width, height, filename);

  //if there's an error, display it
  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;

  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
}

template <typename T>
__global__ void split(const T *i, T *o, unsigned w, unsigned h, unsigned uf){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < (w/2+1)) && (idy < h)){
    int oidx = idx;
    int oidy = (idy >= (h/2))?(idy+h*(uf-1)):idy;
    int iid = (w/2+1)*idy+idx;
    int oid = ((uf*w)/2+1)*oidy+oidx;
    o[oid] = i[iid];}
}

int main(){
  unsigned uf = 3;
  const char *ifn = "test_i.png";
  const char *ofn = "test_o.png";
  // load input image
  std::vector<unsigned char> i;
  unsigned w, h;
  decodeOneStep(i, w, h, ifn);
  std::cout << "width: " << w << " height: " << h << std::endl;
  // convert to grayscale float normalized 255 = 1.0f
  std::vector<float> in_f((w+2)*h);
  for (int j = 0; j < h; j++)
    for (int k = 0; k < w; k++){ 
      int idx1 = j*(w+2)+k;
      int idx2 = j*w+k;
      int val = 0.299*i[4*idx2] + 0.587*i[4*idx2+1] + 0.114*i[4*idx2+2];
      if (val > 255) val = 255;
      in_f[idx1] = val/256.0;}
  // copy to device
  cufftComplex *d_i;
  cudaMalloc(&d_i, ((w/2)+1)*h*sizeof(cufftComplex));
  cudaMemcpy(d_i, in_f.data(), ((w/2)+1)*h*sizeof(cufftComplex), cudaMemcpyHostToDevice);
  // perform forward FFT
  cufftHandle hf;
  cufftResult r = cufftPlan2d(&hf, h, w, CUFFT_R2C); 
  r = cufftExecR2C(hf, (cufftReal *)d_i, d_i);
  // perform data split to corners of upscale complex image
  cufftComplex *d_u; 
  cudaMalloc(&d_u, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
  cudaMemset(d_u, 0, uf*h*((uf*w)/2+1)*sizeof(cufftComplex));
  dim3 sblock(32,32);
  dim3 sgrid((((w/2)+1)+sblock.x-1)/sblock.x, (h+sblock.y-1)/sblock.y);
  split<<<sgrid, sblock>>>(d_i, d_u, w, h, uf);
  // perform inverse FFT
  cufftHandle hi;
  r = cufftPlan2d(&hi, h*uf, w*uf, CUFFT_C2R); 
  r = cufftExecC2R(hi, d_u, (cufftReal *)d_u);
  std::vector<float> of((w*uf+2)*h*uf);
  cudaMemcpy(of.data(), d_u, (w*uf+2)*h*uf*sizeof(float), cudaMemcpyDeviceToHost);
  // convert to ordinary unsigned char RGBA image (as grayscale image)
  std::vector<unsigned char> o(w*h*uf*uf*4);
  for (int j = 0; j < h*uf; j++)
    for (int k = 0; k < w*uf; k++) {
      int idx1 = j*(w*uf)+k;
      int idx2 = j*(w*uf+2)+k;
      if (of[idx2] > w*h) of[idx2] = w*h;
      if (of[idx2] < 0) of[idx2] = 0;
      unsigned char pix = (of[idx2]/(w*h))*255;
      o[4*idx1]   = pix;
      o[4*idx1+1] = pix;
      o[4*idx1+2] = pix;
      o[4*idx1+3] = 255;}
  encodeOneStep(ofn, o, w*uf, h*uf);
}

Here is the generated output:

Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa Dịch vụ tổ chức sự kiện 5 sao Thông tin về chúng tôi Dịch vụ sinh nhật bé trai Dịch vụ sinh nhật bé gái Sự kiện trọn gói Các tiết mục giải trí Dịch vụ bổ trợ Tiệc cưới sang trọng Dịch vụ khai trương Tư vấn tổ chức sự kiện Hình ảnh sự kiện Cập nhật tin tức Liên hệ ngay Thuê chú hề chuyên nghiệp Tiệc tất niên cho công ty Trang trí tiệc cuối năm Tiệc tất niên độc đáo Sinh nhật bé Hải Đăng Sinh nhật đáng yêu bé Khánh Vân Sinh nhật sang trọng Bích Ngân Tiệc sinh nhật bé Thanh Trang Dịch vụ ông già Noel Xiếc thú vui nhộn Biểu diễn xiếc quay đĩa Dịch vụ tổ chức tiệc uy tín Khám phá dịch vụ của chúng tôi Tiệc sinh nhật cho bé trai Trang trí tiệc cho bé gái Gói sự kiện chuyên nghiệp Chương trình giải trí hấp dẫn Dịch vụ hỗ trợ sự kiện Trang trí tiệc cưới đẹp Khởi đầu thành công với khai trương Chuyên gia tư vấn sự kiện Xem ảnh các sự kiện đẹp Tin mới về sự kiện Kết nối với đội ngũ chuyên gia Chú hề vui nhộn cho tiệc sinh nhật Ý tưởng tiệc cuối năm Tất niên độc đáo Trang trí tiệc hiện đại Tổ chức sinh nhật cho Hải Đăng Sinh nhật độc quyền Khánh Vân Phong cách tiệc Bích Ngân Trang trí tiệc bé Thanh Trang Thuê dịch vụ ông già Noel chuyên nghiệp Xem xiếc khỉ đặc sắc Xiếc quay đĩa thú vị

Filed under: Kiến thức lập trình - @ 14:12

Thẻ: image-processingcudafftcufft

Thiết kế website giá rẻ

Danh mục

CUDA image upsampling with FFT method