I want to perform a fast flip operation on a 3D matrix in CUDA C++, similar to MATLAB's flip, but I have hit a speed bottleneck and need some help. The following uses a 2x2x2 matrix A to demonstrate the flip function (A = reshape(1:8,2,2,2)):
A(:,:,1)
1 3
2 4
----
A(:,:,2)
5 7
6 8
After flip(A,1):
A(:,:,1)
2 4
1 3
----
A(:,:,2)
6 8
5 7
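For reference, here is a minimal CPU sketch of the same flip, assuming the data is stored contiguously in column-major (MATLAB) order so that A(i,j,k) sits at linear index i + j*dim1 + k*dim1*dim2, which is the layout my kernels below index into. The name flip_cpu and the float element type are just placeholders:

// Minimal CPU reference for flip along dim (1, 2, or 3), assuming
// column-major storage: A(i,j,k) lives at val[i + j*dim1 + k*dim1*dim2].
void flip_cpu(int dim, float* out, const float* in, int dim1, int dim2, int dim3)
{
    for (int k = 0; k < dim3; ++k)
        for (int j = 0; j < dim2; ++j)
            for (int i = 0; i < dim1; ++i)
            {
                int si = (dim == 1) ? dim1 - 1 - i : i;   // source row
                int sj = (dim == 2) ? dim2 - 1 - j : j;   // source column
                int sk = (dim == 3) ? dim3 - 1 - k : k;   // source slice
                out[i + j * dim1 + k * dim1 * dim2] =
                    in[si + sj * dim1 + sk * dim1 * dim2];
            }
}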
The following is my current CUDA version of the flip function. Because it will be called many times, its speed is still not acceptable:
__global__ void flip(int dim, Tensor3_C_GPU result, Tensor3_C_GPU X)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int dimx1 = X.dim1; int dimx2 = X.dim2; int dimx3 = X.dim3;
    int dim_x = dimx1 * dimx2 * dimx3;
    // decompose the linear (column-major) index into (row, column, slice)
    int idx1 = idx % dimx1; int idx2 = (idx / dimx1) % dimx2; int idx3 = idx / (dimx1 * dimx2);
    if (dim == 1)
    {
        if (idx < dim_x)
            result.val[-1 - idx1 + ((idx2 + 1) + idx3 * dimx2) * dimx1] = X.val[idx];
            //result.val[(dimx1 - 1 - idx1) + idx2 * dimx1 + idx3 * dimx1 * dimx2] = X.val[idx];
    }
    else if (dim == 2)
    {
        if (idx < dim_x)
            result.val[idx1 + ((idx3 + 1) * dimx2 - (1 + idx2)) * dimx1] = X.val[idx];
            //result.val[idx1 + (dimx2 - 1 - idx2) * dimx1 + idx3 * dimx1 * dimx2] = X.val[idx];
    }
    else if (dim == 3)
    {
        if (idx < dim_x)
            result.val[idx1 + (idx2 - (1 + idx3) * dimx2) * dimx1 + dim_x] = X.val[idx];
            //result.val[idx1 + idx2 * dimx1 + (dimx3 - 1 - idx3) * dimx1 * dimx2] = X.val[idx];
    }
}
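For context, a minimal sketch of how this 1-D kernel could be launched with one thread per element; the block size of 256 and the wrapper name launch_flip are only illustrative, and I am assuming Tensor3_C_GPU carries a device pointer val plus the sizes dim1/dim2/dim3:

// Illustrative 1-D launch: one thread per element, block size is a guess.
void launch_flip(int dim, Tensor3_C_GPU result, Tensor3_C_GPU X)
{
    int n = X.dim1 * X.dim2 * X.dim3;     // total number of elements
    int block = 256;                      // threads per block (tunable)
    int grid = (n + block - 1) / block;   // enough blocks to cover all elements
    flip<<<grid, block>>>(dim, result, X);
}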
The following version is written using a three-dimensional thread layout:
__global__ void flip3D(int dim, Tensor3_C_GPU result, Tensor3_C_GPU X)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    int dimx1 = X.dim1; int dimx2 = X.dim2; int dimx3 = X.dim3;
    int dim_x = dimx1 * dimx2 * dimx3;
    if (dim == 1)
    {
        if (x < dimx1 && y < dimx2 && z < dimx3)
            result.val[(dimx1 - 1 - x) + y * dimx1 + z * dimx1 * dimx2] = X.val[x + y * dimx1 + z * dimx1 * dimx2];
    }
    else if (dim == 2)
    {
        if (x < dimx1 && y < dimx2 && z < dimx3)
            result.val[x + (dimx2 - 1 - y) * dimx1 + z * dimx1 * dimx2] = X.val[x + y * dimx1 + z * dimx1 * dimx2];
    }
    else if (dim == 3)
    {
        if (x < dimx1 && y < dimx2 && z < dimx3)
            result.val[x + y * dimx1 + (dimx3 - 1 - z) * dimx1 * dimx2] = X.val[x + y * dimx1 + z * dimx1 * dimx2];
    }
}
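And a corresponding launch sketch for the 3-D version; the 8x8x8 block shape is again only an example:

// Illustrative 3-D launch: one thread per element along each axis.
void launch_flip3D(int dim, Tensor3_C_GPU result, Tensor3_C_GPU X)
{
    dim3 block(8, 8, 8);                              // example block shape
    dim3 grid((X.dim1 + block.x - 1) / block.x,
              (X.dim2 + block.y - 1) / block.y,
              (X.dim3 + block.z - 1) / block.z);
    flip3D<<<grid, block>>>(dim, result, X);
}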
The commented-out lines show how I originally wrote the indexing; I later rewrote them to save a few multiplications (the two forms are algebraically identical, see the sketch below), but the run time barely changed. I then switched to the 3D thread layout to eliminate the integer divisions, yet the two versions still take roughly the same time. Does anyone have a faster way to do this, or are there functions in an official CUDA library that I could use to implement the flip operation indirectly?
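To make the "save some multiplications" step concrete, here is the index algebra for the dim == 1 case (the other branches factor out dimx1, and fold in dim_x, the same way):

// Destination index for dim == 1, original vs. rewritten form:
//   (dimx1 - 1 - idx1) + idx2 * dimx1 + idx3 * dimx1 * dimx2   // 3 multiplications
// = -1 - idx1 + (1 + idx2 + idx3 * dimx2) * dimx1
// = -1 - idx1 + ((idx2 + 1) + idx3 * dimx2) * dimx1            // 2 multiplications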