I’m seeing an error that I am unable to explain.
I wrote a simple matrix-multiplication kernel that applies the following optimizations:
- Coalesced global memory access
- Shared-memory tiling: input tiles are loaded in phases and the result is computed from shared memory
This is the problem setup:
<code>// Compute C = A*B
// A shape = (M, K)
// B shape = (K, N)
// C shape = (M, N)
// Test for all sizes as (n, n) - n in [16, 32, 64, 128, ..., 2048]
</code>
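For context, the test script further below can also run a kernel named naive, which is just the standard one-output-element-per-thread matmul. A minimal sketch of it (the exact code in my matmulKernels.cpp may differ slightly):
<code>// Sketch of the naive baseline ("naive" in the test script); my actual file may differ.
__global__ void naive(float *C, float *A, float *B, int M, int K, int N){
    // One thread computes one element of C
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if(row < M && col < N){
        float acc = 0.0f;
        for(int k = 0; k < K; k++){
            acc += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = acc;
    }
}
</code>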
Some snippets from my code:
<code>__constant__ float MAX_VALUE = 3.4028235e+38;
__global__ void sharedMemKernel(float *C, float *A, float *B, int M, int K, int N){
Matmul between A(M*K) and B(K*N) to give C (M*N)
Used shared memory optimization
Each blocks loads required input tiles into shared memory
computations are performed by reading from shared memory
Final writes from threads are into global memory
Assume square tiles of size blockDim.x = blockDim.y
extern __shared__ float sharedMem[];
// load inputs A,B in shared memory in phases
// Assume that block is square
// int phases = (int)ceil((float)K / Bsize);
int phases = (K + Bsize - 1)/Bsize;
float *CTile = sharedMem;
float *ATile = CTile + Bsize*Bsize;
float *BTile = ATile + Bsize*Bsize;
// Ensure that shared memory allocated is enough to store all matrices
size_t sharedMemSize = 3 * Bsize * Bsize * sizeof(float);
assert((char*)(BTile + Bsize * Bsize) - (char*)sharedMem <= sharedMemSize);
// ensure CTile is all zeros
CTile[threadIdx.y * Bsize + threadIdx.x] = 0.0f;
for(int p=0; p<phases; p++){
//load A, B into shared memory
loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
for(int i=0; i<Bsize; i++){
//compute phase-wise matrix multiplication
result += ATile[(threadIdx.y)*Bsize + i]*BTile[i*Bsize + threadIdx.x];
// Write result to output
if (result > MAX_VALUE) {
CTile[threadIdx.y*Bsize + threadIdx.x] += result;
if( (blockIdx.y*Bsize + threadIdx.y)<M && blockIdx.x*Bsize+threadIdx.x<N){
C[(blockIdx.y*Bsize + threadIdx.y)*N + blockIdx.x*Bsize+threadIdx.x] = CTile[threadIdx.y*Bsize + threadIdx.x];
<code>__constant__ float MAX_VALUE = 3.4028235e+38;

__global__ void sharedMemKernel(float *C, float *A, float *B, int M, int K, int N){
    /*
    Matmul between A(M*K) and B(K*N) to give C (M*N)
    Uses shared memory optimization
    Each block loads required input tiles into shared memory
    Computations are performed by reading from shared memory
    Final writes from threads are into global memory
    Assume square tiles of size blockDim.x = blockDim.y
    */
    extern __shared__ float sharedMem[];
    int Bsize = blockDim.x;
    // load inputs A, B into shared memory in phases
    // Assume that the block is square
    // int phases = (int)ceil((float)K / Bsize);
    int phases = (K + Bsize - 1)/Bsize;

    float *CTile = sharedMem;
    float *ATile = CTile + Bsize*Bsize;
    float *BTile = ATile + Bsize*Bsize;

    // Ensure that the shared memory allocated is enough to store all three tiles
    size_t sharedMemSize = 3 * Bsize * Bsize * sizeof(float);
    assert((char*)(BTile + Bsize * Bsize) - (char*)sharedMem <= sharedMemSize);

    // ensure CTile is all zeros
    CTile[threadIdx.y * Bsize + threadIdx.x] = 0.0f;
    __syncthreads();

    for(int p=0; p<phases; p++){
        // load A, B into shared memory
        loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
        loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
        __syncthreads();

        float result = 0.0f;
        for(int i=0; i<Bsize; i++){
            // compute phase-wise matrix multiplication
            result += ATile[(threadIdx.y)*Bsize + i]*BTile[i*Bsize + threadIdx.x];
        }
        // Write result to output
        if (result > MAX_VALUE) {
            printf("Max value!!\n");
            result = MAX_VALUE;
        }
        CTile[threadIdx.y*Bsize + threadIdx.x] += result;
    }
    if( (blockIdx.y*Bsize + threadIdx.y)<M && blockIdx.x*Bsize+threadIdx.x<N){
        C[(blockIdx.y*Bsize + threadIdx.y)*N + blockIdx.x*Bsize+threadIdx.x] = CTile[threadIdx.y*Bsize + threadIdx.x];
    }
}
</code>
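My tests launch this kernel through PyCUDA (script further below), but the plain CUDA C++ launch I have in mind is roughly the following sketch. The only non-obvious part is sizing the dynamic shared memory for the three Bsize×Bsize tiles (CTile, ATile, BTile); d_A, d_B, d_C are placeholder device pointers. Here grid.x covers columns (N) and grid.y covers rows (M) to match the kernel's bounds checks; with the square test matrices M == N, so the ordering does not matter:
<code>// Hypothetical host-side launch (sketch only; my real tests go through PyCUDA)
int Bsize = 32;                                    // square block, blockDim.x == blockDim.y
dim3 block(Bsize, Bsize, 1);
dim3 grid((N + Bsize - 1) / Bsize,                 // grid.x covers columns (N)
          (M + Bsize - 1) / Bsize, 1);             // grid.y covers rows (M)
size_t shmem = 3 * (size_t)Bsize * Bsize * sizeof(float);   // CTile + ATile + BTile
sharedMemKernel<<<grid, block, shmem>>>(d_C, d_A, d_B, M, K, N);
</code>
The tile-loading device function called for A is below (the one for B, loadIntoSharedMemoryRowPhase, is at the end of the post):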
<code>__device__ void loadIntoSharedMemoryColPhase(float *Ashared, float *A, int colTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across columns of input matrix A
    */
    const unsigned int RowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int ColIdx = colTileIdx*blockDim.x + threadIdx.x;
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from A[%d][%d]\n", A[RowIdx*AcolSize + ColIdx], RowIdx, ColIdx);
    // }
}
</code>
Here are the interesting errors that I see:
- Without the check that result exceeds MAX_VALUE, matrices of size 64×64 randomly differ from the CPU output used for testing. Sizes beyond that (128, 256, 512, etc.) always fail.
- When I reduced the block size to (4, 4) from the original (32, 32), all tests passed!
This is unclear to me for the following reasons:
- In my experience, floating-point issues that arise on the GPU should arise on the CPU as well, and the outputs should be "equally" wrong. Why did the MAX_VALUE condition help the 64×64 test case pass? I found a relevant source here: PyCUDA precision of matrix multiplication code, which would potentially explain it if my CPU code were performing this clipping by default. (A diagnostic I am considering is sketched after this list.)
- Why does the block size decide the result of my computation at all? If an SM does not have enough resources, wouldn't the scheduler simply wait until enough resources are available before running the block? What could the reason be for this change to work? I tried it without any particular intuition and cannot explain the behavior.
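One diagnostic I am considering (a sketch only, I have not verified it) is to keep the per-phase partial sum in double precision inside the kernel. If the mismatch were purely due to float32 rounding and accumulation order, this variant should come much closer to the float64 CPU result; a genuine bug in the kernel would still show up:
<code>// Sketch (untested): the phase loop of sharedMemKernel with the per-phase sum kept in
// double precision, to separate pure float32 rounding/accumulation-order effects from a
// genuine bug. Everything outside this loop stays exactly as in the kernel above; the
// MAX_VALUE clamp is omitted here for clarity.
for(int p=0; p<phases; p++){
    loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
    loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
    __syncthreads();

    double result = 0.0;                          // was: float result = 0.0f;
    for(int i=0; i<Bsize; i++){
        result += (double)ATile[threadIdx.y*Bsize + i]
                * (double)BTile[i*Bsize + threadIdx.x];
    }
    CTile[threadIdx.y*Bsize + threadIdx.x] += (float)result;   // tile itself stays float32
}
</code>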
Adding the Python test code:
<code>import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np

BLOCK = (4, 4, 1)  # Use a tuple of integers for BLOCK dimensions


def GRID(M, N):
    return (int(np.ceil(M / BLOCK[0])), int(np.ceil(N / BLOCK[1])),
            1)  # Return a tuple of integers for GRID dimensions


def get_random_matrices(M, K, N):
    A = np.random.random((M, K)).astype(np.float32)
    B = np.random.random((K, N)).astype(np.float32)
    C = np.random.random((M, N)).astype(np.float32)
    return A, B, C


def get_ABC_gpu(A, B, C):
    A_gpu = gpuarray.to_gpu(A)
    B_gpu = gpuarray.to_gpu(B)
    C_gpu = gpuarray.to_gpu(C)
    return A_gpu, B_gpu, C_gpu


def load_kernel(kernel_path='./matmulKernels.cpp'):
    with open(kernel_path, 'r') as f:
        kernel = f.read()
    kernel = r'{}'.format(kernel)
    return SourceModule(kernel)


def matmul_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_withtranspose_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B.T, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_cpu(A, B, C, M, K, N, transpose=False):
    Ashape = A.shape
    Bshape = B.shape
    if (not transpose):
        assert (Ashape[1] == Bshape[0])
        C = np.matmul(A, B)
    else:
        assert (Ashape[1] == Bshape[1])
        C = np.matmul(A, B.T)
    return C


def valid_check(M, K, N, fname='naive', verbose=False, shared=False,
                transpose=False):
    A, B, C = get_random_matrices(M, K, N)
    if (transpose):
        C_gpu_comp = matmul_withtranspose_gpu(A, B, C, M, K, N, fname=fname,
                                              shared=shared)
    else:
        C_gpu_comp = matmul_gpu(A, B, C, M, K, N, fname=fname, shared=shared)
    C_cpu_comp = matmul_cpu(A, B, C, M, K, N, transpose)
    try:
        assert (np.allclose(C_cpu_comp, C_gpu_comp))
        print("Valid computation!")
        return 1
    except Exception:
        if (verbose):
            return 0
        else:
            raise Exception("CPU and GPU results do not match!")


# sizes = 2**np.arange(4, 12).astype(np.int32)
sizes = np.array([1235, 257]).astype(np.int32)
num_exps = 1
for i in range(num_exps):
    TRANSPOSE = False
    for size in sizes:
        M, K, N = size, size, size
        print("*"*50)
        print(f"Testing M = {M}, K = {K}, N = {N}, transpose={TRANSPOSE}")
        valid = valid_check(M, K, N, fname='sharedMemKernel', shared=True,
                            verbose=True, transpose=TRANSPOSE)
        if (valid == 1):
            print("\033[32mTest passed!\033[0m")
            print("*"*50)
        elif (valid == 0):
            print("\033[31m.....FAILED!.....\033[0m")
            print("*"*50)
</code>
Additional device function:
<code>__device__ void loadIntoSharedMemoryRowPhase(float *Ashared, float *A, int rowTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across rows of input matrix A
    */
    const unsigned int RowIdx = rowTileIdx*blockDim.y + threadIdx.y;
    const unsigned int ColIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from B[%d][%d]\n", A[RowIdx*AcolSize + ColIdx], RowIdx, ColIdx);
    // }
}
</code>