I’m trying to understand the impact of thread coarsening in a convolution kernel. I have been trying to reuse the convolution matrix and issue fewer global memory accesses per pixel by doing more work per thread. But I do not understand the difference in performance at each coarsening level:
Coarsening | Runtime in ms | Uncoalesced Access Ratio |
---|---|---|
Baseline (none) | 850 | 24% of the total 13076236 sectors |
2x | 651 | 38% of the total 13600268 sectors |
4x | 546 | 55% of the total 17006476 sectors |
8x | 641 | 71% of the total 24997964 sectors |
12x | 581 | 78% of the total 33407040 sectors |
16x | 897 | 83% of the total 41558194 sectors |
I also do not understand why there is uncoalesced memory access for the baseline and the other kernels (both the input and the output are row-major image data). These are examples of the kernels:
// Baseline (no coarsening): 3x3 weighted filter over a row-major 8-bit image,
// one thread per interior pixel.
//
// Expected launch: 2D grid/block where the x dimension spans image columns and
// the y dimension spans image rows, so adjacent lanes of a warp read adjacent
// bytes (coalescing-friendly).
//
// `input` is read-only here, so it is qualified `const __restrict__` to let
// the compiler route loads through the read-only data cache.
//
// NOTE(review): `>> 4` divides by 16, which assumes the weights in
// global_conv_kernel3x3 sum to 16 (e.g. a 1-2-1 / 2-4-2 / 1-2-1 Gaussian) —
// confirm against the kernel definition.
__global__ void k_1D_gf_3x3_global(const unsigned char* __restrict__ input,
                                   unsigned char* __restrict__ output,
                                   int rows, int cols)
{
    // threadIdx.x (the fast-varying index) maps to the column; threadIdx.y
    // maps to the row. Named `col`/`row` to avoid the tx/ty confusion.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Skip the 1-pixel border: the 3x3 window must stay inside the image.
    if (row <= 0 || row >= rows - 1 || col <= 0 || col >= cols - 1)
        return;

    // Accumulate the 3x3 weighted sum in int (uchar * int promotes to int).
    int sum = 0;
    #pragma unroll
    for (int dy = -1; dy <= 1; ++dy) {
        #pragma unroll
        for (int dx = -1; dx <= 1; ++dx) {
            sum += global_conv_kernel3x3[dy + 1][dx + 1]
                 * input[(row + dy) * cols + (col + dx)];
        }
    }

    output[row * cols + col] = sum >> 4;  // weights assumed to sum to 16
}
// 3x3 weighted sum over a register-resident window, divided by 16.
// NOTE(review): assumes the global_conv_kernel3x3 weights sum to 16 — confirm.
static __device__ __forceinline__ unsigned char conv3x3_shift4(
    const unsigned char frame[3][3])
{
    int sum = 0;
    #pragma unroll
    for (int dy = 0; dy < 3; ++dy) {
        #pragma unroll
        for (int dx = 0; dx < 3; ++dx)
            sum += global_conv_kernel3x3[dy][dx] * frame[dy][dx];
    }
    return (unsigned char)(sum >> 4);
}

// 2x coarsened: each thread produces COARSEN horizontally adjacent output
// pixels, reusing 6 of the 9 window bytes between neighbors (the window
// slides one column right per extra pixel, so only 3 new loads are needed).
//
// NOTE(review): coarsening along x multiplies the column stride between
// adjacent lanes of a warp by the coarsening factor, so lanes no longer
// touch adjacent bytes — this is what drives the uncoalesced-sector ratio
// up as the factor grows.
__global__ void k_1D_gf_3x3_load_balance2_global(const unsigned char* __restrict__ input,
                                                 unsigned char* __restrict__ output,
                                                 int rows, int cols)
{
    const int COARSEN = 2;  // output pixels per thread
    int col = (blockIdx.x * blockDim.x + threadIdx.x) * COARSEN;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned char frame[3][3];  // sliding 3x3 window held by this thread

    if (row <= 0 || row >= rows - 1 || col <= 0 || col >= cols - 1)
        return;

    // Full 3x3 window load for the first pixel of this thread's run.
    #pragma unroll
    for (int dy = 0; dy < 3; ++dy) {
        #pragma unroll
        for (int dx = 0; dx < 3; ++dx)
            frame[dy][dx] = input[(row - 1 + dy) * cols + (col - 1 + dx)];
    }
    output[row * cols + col] = conv3x3_shift4(frame);

    // Remaining coarsened pixels: slide the window one column to the right
    // and fetch only the new rightmost column. The row bounds were already
    // checked above and `col + i > 0` always holds, so only the right edge
    // needs re-checking.
    for (int i = 1; i < COARSEN; ++i) {
        int c = col + i;
        shift_left(frame);
        if (c < cols - 1) {
            frame[0][2] = input[(row - 1) * cols + c + 1];
            frame[1][2] = input[row * cols + c + 1];
            frame[2][2] = input[(row + 1) * cols + c + 1];
            output[row * cols + c] = conv3x3_shift4(frame);
        }
    }
}
Here is a comparison of the baseline and 4x:
Here is a comparison of 8x and 12x:
Above I added the comparison memory chart of the kernels. Why does data transfer between L1 and L2 increase in direct proportion to the coarsening rate? (300% for 4x, 700% for 8x, …)
I have tried profiling with ncu, but I don’t know exactly which metrics I am supposed to look at.
9