I am trying to implement nn.Conv1d using CUDA, but I cannot get the correct answer. It would be great if anyone could help me debug it. Currently the dimensions of the input and output are correct, and the input x is (B, L, D) (batch, length, dimension).
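For context, the behaviour I am trying to match is (as far as I understand it) equivalent to the libtorch call below. This is only a sketch of my reference check, and the helper name and layout assumptions are mine: it assumes the weight is stored as (K, D), the bias as (D), and that the convolution is depthwise (groups = D).

#include <torch/torch.h>

// Reference I compare the kernel against (sketch; assumes weight is (K, D),
// bias is (D), and the conv is depthwise, i.e. groups = number of channels D).
torch::Tensor conv1d_reference(torch::Tensor u,       // (B, L, D)
                               torch::Tensor weight,  // (K, D)
                               torch::Tensor bias,    // (D)
                               int64_t padding) {
    // torch::conv1d expects (B, C, L) input and (C_out, C_in/groups, K) weights.
    auto x = u.permute({0, 2, 1}).contiguous();                 // (B, D, L)
    auto w = weight.permute({1, 0}).unsqueeze(1).contiguous();  // (D, 1, K)
    auto y = torch::conv1d(x, w, bias,
                           /*stride=*/1, /*padding=*/padding,
                           /*dilation=*/1, /*groups=*/u.size(2));
    return y.permute({0, 2, 1}).contiguous();                   // (B, L_out, D)
}

I compare my kernel's output against this with a fairly loose tolerance, since everything is fp16.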
Here is my code
// Simple 1D depthwise convolution implementation with dilation = 1 and stride = 1
const uint STRIDE = 1;
const uint DILATION = 1;
__global__ void conv1d_kernel(
    const __half2 *__restrict__ u,
    const __half2 *__restrict__ weights,
    const __half2 *__restrict__ bias,
    __half2 *__restrict__ out,
    uint padding,
    uint B,
    uint L,
    uint D,
    uint L_out,
    uint K)
{
    const int d_idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int l_idx = blockIdx.y * blockDim.y + threadIdx.y;
    const int b_idx = blockIdx.z * blockDim.z + threadIdx.z;
    if (b_idx >= B || d_idx >= D) {
        return;
    }
    __half2 tmp = __float2half2_rn(0.0f);
    for (int k = 0; k < K; k++) {
        int input_idx = l_idx * STRIDE + k * DILATION - padding;
        if (input_idx >= 0 && input_idx < L) {
            tmp = __hfma2(u[b_idx * L * D + input_idx * D + d_idx], weights[k * D + d_idx], tmp);
        }
    }
    out[b_idx * L_out * D + l_idx * D + d_idx] = tmp;
}
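Just to be explicit, what each thread is supposed to compute (this is what the loop above is intended to do, written out) is

out[b, l, d] = sum_{k=0}^{K-1} u[b, l*STRIDE + k*DILATION - padding, d] * weights[k, d]

with out-of-range input positions treated as zero (that is what the bounds check on input_idx is for).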
// Do not change the function signature or return type!
torch::Tensor conv1d_cuda_half(
    torch::Tensor u,
    torch::Tensor weight,
    torch::Tensor bias,
    uint padding)
{
    const uint b = u.size(0);
    const uint l = u.size(1);
    const uint d = u.size(2);
    const uint k = weight.size(0);
    printf("b = %d\n", b);
    printf("l = %d\n", l);
    printf("d = %d\n", d);
    printf("k = %d\n", k);
    // computes the output length. For more info see https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
    uint l_out = ((l + 2 * padding - DILATION * (k - 1) - 1) / STRIDE + 1);
    printf("l_out = %d\n", l_out);
    // block dimensions
    dim3 blockDims;
    // TODO: set the block dimensions
    blockDims.x = 1; // along L
    blockDims.y = 1; // along D
    blockDims.z = 1; // along B
    dim3 gridDims;
    // TODO: set the grid dimensions
    gridDims.x = (b + blockDims.x - 1) / blockDims.x;
    gridDims.y = (l + blockDims.y - 1) / blockDims.y;
    gridDims.z = (d + blockDims.z - 1) / blockDims.z;
    // create the output tensor
    torch::Tensor out = torch::empty({b, l_out, d}, u.options());
    // feel free to modify the kernel call however you like
    conv1d_kernel<<<gridDims, blockDims>>>(
        static_cast<__half2 *>(u.data_ptr()),
        static_cast<__half2 *>(weight.data_ptr()),
        static_cast<__half2 *>(bias.data_ptr()),
        static_cast<__half2 *>(out.data_ptr()),
        padding,
        b,
        l,
        d,
        l_out,
        k);
    return out;
}
Thanks a lot!!
I also tried switching d_idx to blockIdx.z * blockDim.z + threadIdx.z, but the result is still not correct.
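If I instead keep the kernel as above and only change the launch side to match its indexing (blockIdx.x/threadIdx.x along D, y along L_out, z along B), I believe the configuration should look something like this (just a sketch; the block sizes are arbitrary placeholders, not tuned):

// Launch configuration that (I think) matches the kernel's indexing.
dim3 blockDims(32, 8, 1);
dim3 gridDims((d + blockDims.x - 1) / blockDims.x,      // along D     (d_idx)
              (l_out + blockDims.y - 1) / blockDims.y,  // along L_out (l_idx)
              (b + blockDims.z - 1) / blockDims.z);     // along B     (b_idx)

Does that mapping look right, or should I be looking somewhere else?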