PyCUDA | Shared-Memory Matrix Multiplication with Phases | Unintuitive Error

I’m seeing an error which I am unable to explain.

I wrote a simple kernel for matrix multiplication that performs the following optimizations:

  1. Coalesced Global Memory Access
  2. Shared-memory tiling: input tiles are loaded in phases and the result is accumulated per phase

This is the problem setup:

// Compute C = A*B
// A shape = (M, K) 
// B shape = (K, N) 
// C shape = (M, N) 
// Test for all sizes as (n, n) - n in  [16, 32, 64, 128, ..., 2048]

Some snippets from my code:

__constant__ float MAX_VALUE = 3.4028235e+38;

__global__ void sharedMemKernel(float *C, float *A, float *B, int M, int K, int N){
    /*
    Matmul between A(M*K) and B(K*N) to give C (M*N)
    Uses shared memory optimization
    Each block loads required input tiles into shared memory
    computations are performed by reading from shared memory
    Final writes from threads are into global memory
    Assume square tiles of size blockDim.x = blockDim.y
    */
    extern __shared__ float sharedMem[];
    
    int Bsize = blockDim.x;
    
    // load inputs A,B in shared memory in phases
    // Assume that block is square
    
    // int phases = (int)ceil((float)K / Bsize);
    int phases = (K + Bsize - 1)/Bsize;

    float *CTile = sharedMem;
    float *ATile = CTile + Bsize*Bsize;
    float *BTile = ATile + Bsize*Bsize;

    // Ensure that shared memory allocated is enough to store all matrices
    size_t sharedMemSize = 3 * Bsize * Bsize * sizeof(float); 
    assert((char*)(BTile + Bsize * Bsize) - (char*)sharedMem <= sharedMemSize);


    // ensure CTile is all zeros
    CTile[threadIdx.y * Bsize + threadIdx.x] = 0.0f;
    __syncthreads();
    
    
    for(int p=0; p<phases; p++){
        //load A, B into shared memory
        
        loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
        loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);

        __syncthreads();

        float result = 0.0f;
        for(int i=0; i<Bsize; i++){
            //compute phase-wise matrix multiplication
            result += ATile[(threadIdx.y)*Bsize + i]*BTile[i*Bsize + threadIdx.x];
        }
        // Write result to output
        if (result > MAX_VALUE) {
            printf("Max value!!n");
            result = MAX_VALUE;
        }
        CTile[threadIdx.y*Bsize + threadIdx.x] += result;
    } 
    if( (blockIdx.y*Bsize + threadIdx.y)<M && blockIdx.x*Bsize+threadIdx.x<N){
        C[(blockIdx.y*Bsize + threadIdx.y)*N + blockIdx.x*Bsize+threadIdx.x] = CTile[threadIdx.y*Bsize + threadIdx.x];
    }
}
__device__ void loadIntoSharedMemoryColPhase(float *Ashared, float *A, int colTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across columns of input matrix A
    */
    const unsigned int RowIdx =  blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int ColIdx =  colTileIdx*blockDim.x + threadIdx.x;
    
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from A[%d][%d]\n",A[RowIdx*AcolSize + ColIdx],RowIdx,ColIdx);
    // }
}

Here are interesting errors that I see:

  1. Without the check that clamps result to MAX_VALUE, I see that 64×64 matrices randomly differ from the CPU output used for testing (see the error-magnitude sketch after this list). Matrices beyond that (128, 256, 512, etc.) always give an error.

  2. When I reduced the block size to (4, 4) from the original (32, 32), all tests passed!
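
To make “randomly differ” concrete, here is a minimal sketch of how the mismatch could be quantified (NumPy only; C_cpu_comp and C_gpu_comp are the arrays produced by the test harness further below):

import numpy as np

def report_mismatch(C_cpu, C_gpu):
    # Largest absolute and relative error between the CPU and GPU results
    abs_err = np.abs(C_cpu - C_gpu)
    rel_err = abs_err / np.maximum(np.abs(C_cpu), np.finfo(np.float32).tiny)
    print(f"max abs err = {abs_err.max():.6e}")
    print(f"max rel err = {rel_err.max():.6e}")
    print(f"np.allclose = {np.allclose(C_cpu, C_gpu)}")

# report_mismatch(C_cpu_comp, C_gpu_comp)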

This is unclear to me for the following reasons:

  1. In my experience, floating-point issues that arise on a GPU should arise on the CPU as well, and the outputs should be “equally” wrong. Why did the condition help pass the 64×64 test case?

    I found a relevant source here: PyCUDA precision of matrix multiplication code, which potentially makes this clear. It would make sense if my CPU code were performing this clipping by default.

  2. Why does the block size decide the result of my computation at all? If an SM does not have enough resources, would the resource manager not wait until enough resources are available before executing the block? What could the reason for this change working be? I tried it without any real intuition and cannot explain this behavior. (A device-limit check to probe this is sketched right after this list.)
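
To probe the resource question in point 2, here is a minimal sketch of a device-limit check (it only uses standard pycuda.driver device attributes; the 3-tile shared-memory requirement mirrors SHARED_MEM_SIZE in the harness below):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

dev = pycuda.autoinit.device
max_smem = dev.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)
max_threads = dev.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK)
print(f"max shared memory per block: {max_smem} bytes")
print(f"max threads per block:       {max_threads}")

# Dynamic shared memory the kernel requests for a square block of width bsize (3 float32 tiles)
for bsize in (4, 8, 16, 32):
    required = 3 * bsize * bsize * np.dtype(np.float32).itemsize
    print(f"block ({bsize}, {bsize}): {bsize * bsize} threads, {required} bytes of dynamic shared memory")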

Adding the Python test code:

import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np

BLOCK = (4, 4, 1)  # Use a tuple of integers for BLOCK dimensions


def GRID(M, N):
    return (int(np.ceil(M / BLOCK[0])), int(np.ceil(N / BLOCK[1])),
            1)  # Return a tuple of integers for GRID dimensions


def get_random_matrices(M, K, N):
    A = np.random.random((M, K)).astype(np.float32)
    B = np.random.random((K, N)).astype(np.float32)
    C = np.random.random((M, N)).astype(np.float32)
    return A, B, C


def get_ABC_gpu(A, B, C):
    A_gpu = gpuarray.to_gpu(A)
    B_gpu = gpuarray.to_gpu(B)
    C_gpu = gpuarray.to_gpu(C)
    return A_gpu, B_gpu, C_gpu


def load_kernel(kernel_path='./matmulKernels.cpp'):
    with open(kernel_path, 'r') as f:
        kernel = f.read()
    kernel = r'{}'.format(kernel)
    return SourceModule(kernel)


def matmul_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_withtranspose_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B.T, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_cpu(A, B, C, M, K, N, transpose=False):
    Ashape = A.shape
    Bshape = B.shape
    if (not transpose):
        assert (Ashape[1] == Bshape[0])
        C = np.matmul(A, B)
    else:
        assert (Ashape[1] == Bshape[1])
        C = np.matmul(A, B.T)
    return C


def valid_check(M, K, N, fname='naive', verbose=False, shared=False,
                transpose=False):
    A, B, C = get_random_matrices(M, K, N)
    if (transpose):
        C_gpu_comp = matmul_withtranspose_gpu(A, B, C, M, K, N, fname=fname,
                                              shared=shared)
    else:
        C_gpu_comp = matmul_gpu(A, B, C, M, K, N, fname=fname, shared=shared)
    C_cpu_comp = matmul_cpu(A, B, C, M, K, N, transpose)
    try:
        assert (np.allclose(C_cpu_comp, C_gpu_comp))
        print("Valid computation!")
        return 1
    except Exception:
        if (verbose):
            return 0
        else:
            raise Exception("CPU and GPU results do not match!")


# sizes = 2**np.arange(4, 12).astype(np.int32)
sizes = np.array([1235, 257]).astype(np.int32)
num_exps = 1
for i in range(num_exps):
    TRANSPOSE = False
    for size in sizes:
        M, K, N = size, size, size
        print("*"*50)
        print(f"Testing M = {M}, K = {K}, N = {N}, transpose={TRANSPOSE}")
        valid = valid_check(M, K, N, fname='sharedMemKernel', shared=True,
                            verbose=True, transpose=TRANSPOSE)
        if (valid == 1):
            print("33[32mTest passed!33[0m")
            print("*"*50)
        elif (valid == 0):
            print("33[31m.....FAILED!.....33[0m")
            print("*"*50)

Additional device function:

__device__ void loadIntoSharedMemoryRowPhase(float *Ashared, float *A, int rowTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across rows of input matrix A
    */
    const unsigned int RowIdx =  rowTileIdx*blockDim.y + threadIdx.y;
    const unsigned int ColIdx =  blockIdx.x*blockDim.x + threadIdx.x;
    
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from B[%d][%d]\n",A[RowIdx*AcolSize + ColIdx],RowIdx,ColIdx);
    // }
}
