I am writing a software for convolution of 2D images. I am adopting OpenCL due to the high performance on big images (30 MB and above). I have the following kernel code:
static const char* src3x3= R"(
__kernel void conv3x3(__global unsigned int *pixels, unsigned int gwidth, unsigned int gheight,
__global float* kern)
{
unsigned int MyMat[9];
int x = get_global_id(0);
int y = get_global_id(1);
int res = 0;
for(int i=0; i <3; i++) {
int r,g,b;
for(int j=0; j < 3; j++) {
r = (pixels[(x+j) + (gwidth * (y+i))] >> 16) & 0xFFu;
g = (pixels[(x+j) + (gwidth * (y+i))] >> 8) & 0xFFu;
b = (pixels[(x+j) + (gwidth * (y+i))]) & 0xFFu;
r *= kern[i * 3 + j];
g *= kern[i * 3 + j];
b *= kern[i * 3 + j];
MyMat[i * 3 + j]= (0xFF000000) | ( ( r & 0xFFu) << 16) | ( (g & 0xFFu) << 8) | (b & 0xFFu);
}
}
pixels[y * gwidth + x] = MyMat[4];
}
)";
and the following [host]:
void gpu_kernel::conv3x3(unsigned int *pixels, unsigned int x, unsigned int y, unsigned int w, unsigned int h,
float (&kern)[9])
{
size_t dim = 2;
size_t global_offset[] = {x, y};
size_t global_size[] = {w, h};
size_t wsize = 0 ;
size_t globwsize = w * h * sizeof(unsigned int);
cl_mem bufferx, bufferkern;
size_t log_size;
char* program_log = 0;
printf("x:%lu y:%lu w:%lu h:%lu size is (%lu)rn", x, y, w, h,globwsize);
clGetPlatformIDs(1, &m_kernctx.platform, &m_kernctx.platforms);
clGetDeviceIDs(m_kernctx.platform, CL_DEVICE_TYPE_ALL, 1, &m_kernctx.device, &m_kernctx.numdevs);
m_kernctx.context = clCreateContext(NULL, 1, &m_kernctx.device, NULL, NULL, NULL);
m_kernctx.command_queue = clCreateCommandQueue(m_kernctx.context, m_kernctx.device, 0, NULL);
bufferx = clCreateBuffer(m_kernctx.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
globwsize , pixels, NULL);
bufferkern = clCreateBuffer(m_kernctx.context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * 9 , kern , NULL);
m_kernctx.program = clCreateProgramWithSource(m_kernctx.context, 1, &src3x3, NULL, &m_kernctx.compilerr);
if (m_kernctx.compilerr) {
return;
}
cl_int res = clBuildProgram(m_kernctx.program, 1, &m_kernctx.device, "", NULL, NULL);
if (res < 0) {
clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) calloc(log_size+1, sizeof(char));
clGetProgramBuildInfo(m_kernctx.program, m_kernctx.device, CL_PROGRAM_BUILD_LOG,
log_size+1, program_log, NULL);
printf("%sn", program_log);
free(program_log);
return;
}
cl_int callres = 0;
m_kernctx.kernel = clCreateKernel(m_kernctx.program, "conv3x3", NULL);
callres |= clSetKernelArg(m_kernctx.kernel, 0, sizeof(cl_mem), &bufferx);
callres |= clSetKernelArg(m_kernctx.kernel, 1, sizeof(unsigned int), &w);
callres |= clSetKernelArg(m_kernctx.kernel, 2, sizeof(unsigned int), &h);
callres |= clSetKernelArg(m_kernctx.kernel, 3, sizeof(cl_mem), &bufferkern);
callres |= clGetKernelWorkGroupInfo(m_kernctx.kernel, m_kernctx.device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wsize, NULL);
callres |= clEnqueueNDRangeKernel(m_kernctx.command_queue,
m_kernctx.kernel,
dim,
global_offset,
global_size,
0,
0,
NULL,
NULL);
clFlush(m_kernctx.command_queue);
clFinish(m_kernctx.command_queue);
callres|= clEnqueueReadBuffer(m_kernctx.command_queue, bufferx, CL_TRUE, 0, globwsize, pixels, 0, NULL, NULL);
clReleaseMemObject(bufferx);
clReleaseMemObject(bufferkern);
}
The problem is in the kernel assigning the new middle element to the original data :
pixels[y * gwidth + x] = MyMat[4];
if however changed to :
pixels[y * gwidth + x] ^= MyMat[4];
it does change the image across however not with the bulr filter that is :
[0.111,0.111,0.111,
0.111,0.111,0.111,
0.111,0.111,0.111];
Below are some examples of the application:
The original image:
Without opencl blurring:
Assigning with pixels[y * gwidth + x] = MyMat[4];
Substracting (to test) the original by mid pixels[y * gwidth + x] -= MyMat[4];
I am using AMD RYZEN PRO7 Series with GPU (however can’t recall the model) on Ubuntu 22.04 with installed AMD driver from the official site https://support.zivid.com/en/latest/getting-started/software-installation/gpu/install-opencl-drivers-ubuntu.html
I’ve tested different modifications and the only one that seems not to work is the one that I need to set the middle element to the output data. I’ve searched for similar problems but I couldn’t find any.
Any advice would be helpful. I am new to OpenCL so I may also have misused it or did something incorrect.