I’m seeing an error that I am unable to explain.
I wrote a simple matrix-multiplication kernel that applies the following optimizations:
- Coalesced global memory access
- Shared-memory tiling: input tiles are loaded in phases and the result is computed from shared memory
This is the problem setup:
<code>// Compute C = A*B
// A shape = (M, K)
// B shape = (K, N)
// C shape = (M, N)
// Test for all sizes as (n, n) - n in [16, 32, 64, 128, ..., 2048]
</code>
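For context, the test script further below can also run a kernel named naive, which is just the standard one-output-element-per-thread matmul. A minimal sketch of it (the exact code in my matmulKernels.cpp may differ slightly):
<code>// Sketch of the naive baseline ("naive" in the test script); my actual file may differ.
__global__ void naive(float *C, float *A, float *B, int M, int K, int N){
    // One thread computes one element of C
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    if(row < M && col < N){
        float acc = 0.0f;
        for(int k = 0; k < K; k++){
            acc += A[row*K + k] * B[k*N + col];
        }
        C[row*N + col] = acc;
    }
}
</code>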
Some snippets from my code:
<code>__constant__ float MAX_VALUE = 3.4028235e+38;
__global__ void sharedMemKernel(float *C, float *A, float *B, int M, int K, int N){
Matmul between A(M*K) and B(K*N) to give C (M*N)
Used shared memory optimization
Each blocks loads required input tiles into shared memory
computations are performed by reading from shared memory
Final writes from threads are into global memory
Assume square tiles of size blockDim.x = blockDim.y
extern __shared__ float sharedMem[];
// load inputs A,B in shared memory in phases
// Assume that block is square
// int phases = (int)ceil((float)K / Bsize);
int phases = (K + Bsize - 1)/Bsize;
float *CTile = sharedMem;
float *ATile = CTile + Bsize*Bsize;
float *BTile = ATile + Bsize*Bsize;
// Ensure that shared memory allocated is enough to store all matrices
size_t sharedMemSize = 3 * Bsize * Bsize * sizeof(float);
assert((char*)(BTile + Bsize * Bsize) - (char*)sharedMem <= sharedMemSize);
// ensure CTile is all zeros
CTile[threadIdx.y * Bsize + threadIdx.x] = 0.0f;
for(int p=0; p<phases; p++){
//load A, B into shared memory
loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
for(int i=0; i<Bsize; i++){
//compute phase-wise matrix multiplication
result += ATile[(threadIdx.y)*Bsize + i]*BTile[i*Bsize + threadIdx.x];
// Write result to output
if (result > MAX_VALUE) {
CTile[threadIdx.y*Bsize + threadIdx.x] += result;
if( (blockIdx.y*Bsize + threadIdx.y)<M && blockIdx.x*Bsize+threadIdx.x<N){
C[(blockIdx.y*Bsize + threadIdx.y)*N + blockIdx.x*Bsize+threadIdx.x] = CTile[threadIdx.y*Bsize + threadIdx.x];
<code>__constant__ float MAX_VALUE = 3.4028235e+38;

__global__ void sharedMemKernel(float *C, float *A, float *B, int M, int K, int N){
    /*
    Matmul between A(M*K) and B(K*N) to give C (M*N)
    Uses shared memory optimization
    Each block loads required input tiles into shared memory
    Computations are performed by reading from shared memory
    Final writes from threads are into global memory
    Assume square tiles of size blockDim.x = blockDim.y
    */
    extern __shared__ float sharedMem[];
    int Bsize = blockDim.x;
    // load inputs A, B into shared memory in phases
    // Assume that the block is square
    // int phases = (int)ceil((float)K / Bsize);
    int phases = (K + Bsize - 1)/Bsize;

    float *CTile = sharedMem;
    float *ATile = CTile + Bsize*Bsize;
    float *BTile = ATile + Bsize*Bsize;

    // Ensure that the shared memory allocated is enough to store all three tiles
    size_t sharedMemSize = 3 * Bsize * Bsize * sizeof(float);
    assert((char*)(BTile + Bsize * Bsize) - (char*)sharedMem <= sharedMemSize);

    // ensure CTile is all zeros
    CTile[threadIdx.y * Bsize + threadIdx.x] = 0.0f;
    __syncthreads();

    for(int p=0; p<phases; p++){
        // load A, B into shared memory
        loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
        loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
        __syncthreads();

        float result = 0.0f;
        for(int i=0; i<Bsize; i++){
            // compute phase-wise matrix multiplication
            result += ATile[(threadIdx.y)*Bsize + i]*BTile[i*Bsize + threadIdx.x];
        }
        // Write result to output
        if (result > MAX_VALUE) {
            printf("Max value!!\n");
            result = MAX_VALUE;
        }
        CTile[threadIdx.y*Bsize + threadIdx.x] += result;
    }
    if( (blockIdx.y*Bsize + threadIdx.y)<M && blockIdx.x*Bsize+threadIdx.x<N){
        C[(blockIdx.y*Bsize + threadIdx.y)*N + blockIdx.x*Bsize+threadIdx.x] = CTile[threadIdx.y*Bsize + threadIdx.x];
    }
}
</code>
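My tests launch this kernel through PyCUDA (script further below), but the plain CUDA C++ launch I have in mind is roughly the following sketch. The only non-obvious part is sizing the dynamic shared memory for the three Bsize×Bsize tiles (CTile, ATile, BTile); d_A, d_B, d_C are placeholder device pointers. Here grid.x covers columns (N) and grid.y covers rows (M) to match the kernel's bounds checks; with the square test matrices M == N, so the ordering does not matter:
<code>// Hypothetical host-side launch (sketch only; my real tests go through PyCUDA)
int Bsize = 32;                                    // square block, blockDim.x == blockDim.y
dim3 block(Bsize, Bsize, 1);
dim3 grid((N + Bsize - 1) / Bsize,                 // grid.x covers columns (N)
          (M + Bsize - 1) / Bsize, 1);             // grid.y covers rows (M)
size_t shmem = 3 * (size_t)Bsize * Bsize * sizeof(float);   // CTile + ATile + BTile
sharedMemKernel<<<grid, block, shmem>>>(d_C, d_A, d_B, M, K, N);
</code>
The tile-loading device function called for A is below (the one for B, loadIntoSharedMemoryRowPhase, is at the end of the post):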
<code>__device__ void loadIntoSharedMemoryColPhase(float *Ashared, float *A, int colTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across columns of input matrix A
    */
    const unsigned int RowIdx = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int ColIdx = colTileIdx*blockDim.x + threadIdx.x;
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from A[%d][%d]\n", A[RowIdx*AcolSize + ColIdx], RowIdx, ColIdx);
    // }
}
</code>
Here are the interesting errors that I see:
- Without the check that result exceeds MAX_VALUE, matrices of size 64×64 randomly differ from the CPU output used for testing. Sizes beyond that (128, 256, 512, etc.) always fail.
- When I reduced the block size to (4, 4) from the original (32, 32), all tests passed!
This is unclear to me for the following reasons:
- In my experience, floating-point issues that arise on the GPU should arise on the CPU as well, and the outputs should be "equally" wrong. Why did the MAX_VALUE condition help the 64×64 test case pass? I found a relevant source here: PyCUDA precision of matrix multiplication code, which would potentially explain it if my CPU code were performing this clipping by default. (A diagnostic I am considering is sketched after this list.)
- Why does the block size decide the result of my computation at all? If an SM does not have enough resources, wouldn't the scheduler simply wait until enough resources are available before running the block? What could the reason be for this change to work? I tried it without any particular intuition and cannot explain the behavior.
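One diagnostic I am considering (a sketch only, I have not verified it) is to keep the per-phase partial sum in double precision inside the kernel. If the mismatch were purely due to float32 rounding and accumulation order, this variant should come much closer to the float64 CPU result; a genuine bug in the kernel would still show up:
<code>// Sketch (untested): the phase loop of sharedMemKernel with the per-phase sum kept in
// double precision, to separate pure float32 rounding/accumulation-order effects from a
// genuine bug. Everything outside this loop stays exactly as in the kernel above; the
// MAX_VALUE clamp is omitted here for clarity.
for(int p=0; p<phases; p++){
    loadIntoSharedMemoryColPhase(ATile, A, p, M, K);
    loadIntoSharedMemoryRowPhase(BTile, B, p, K, N);
    __syncthreads();

    double result = 0.0;                          // was: float result = 0.0f;
    for(int i=0; i<Bsize; i++){
        result += (double)ATile[threadIdx.y*Bsize + i]
                * (double)BTile[i*Bsize + threadIdx.x];
    }
    CTile[threadIdx.y*Bsize + threadIdx.x] += (float)result;   // tile itself stays float32
}
</code>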
Adding the Python test code:
<code>import pycuda.driver as cuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np

BLOCK = (4, 4, 1)  # Use a tuple of integers for BLOCK dimensions


def GRID(M, N):
    return (int(np.ceil(M / BLOCK[0])), int(np.ceil(N / BLOCK[1])),
            1)  # Return a tuple of integers for GRID dimensions


def get_random_matrices(M, K, N):
    A = np.random.random((M, K)).astype(np.float32)
    B = np.random.random((K, N)).astype(np.float32)
    C = np.random.random((M, N)).astype(np.float32)
    return A, B, C


def get_ABC_gpu(A, B, C):
    A_gpu = gpuarray.to_gpu(A)
    B_gpu = gpuarray.to_gpu(B)
    C_gpu = gpuarray.to_gpu(C)
    return A_gpu, B_gpu, C_gpu


def load_kernel(kernel_path='./matmulKernels.cpp'):
    with open(kernel_path, 'r') as f:
        kernel = f.read()
    kernel = r'{}'.format(kernel)
    return SourceModule(kernel)


def matmul_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_withtranspose_gpu(A, B, C, M, K, N, fname='naive', shared=False):
    A_gpu, B_gpu, C_gpu = get_ABC_gpu(A, B.T, C)
    mod = load_kernel()
    func = mod.get_function(fname)
    if (not shared):
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N))
    else:
        SHARED_MEM_SIZE = int(BLOCK[0] * BLOCK[1] * 3
                              * np.dtype(np.float32).itemsize)
        func(C_gpu, A_gpu, B_gpu, M, K, N, block=BLOCK, grid=GRID(M, N),
             shared=SHARED_MEM_SIZE)
    C = C_gpu.get()
    return C


def matmul_cpu(A, B, C, M, K, N, transpose=False):
    Ashape = A.shape
    Bshape = B.shape
    if (not transpose):
        assert (Ashape[1] == Bshape[0])
        C = np.matmul(A, B)
    else:
        assert (Ashape[1] == Bshape[1])
        C = np.matmul(A, B.T)
    return C


def valid_check(M, K, N, fname='naive', verbose=False, shared=False,
                transpose=False):
    A, B, C = get_random_matrices(M, K, N)
    if (transpose):
        C_gpu_comp = matmul_withtranspose_gpu(A, B, C, M, K, N, fname=fname,
                                              shared=shared)
    else:
        C_gpu_comp = matmul_gpu(A, B, C, M, K, N, fname=fname, shared=shared)
    C_cpu_comp = matmul_cpu(A, B, C, M, K, N, transpose)
    try:
        assert (np.allclose(C_cpu_comp, C_gpu_comp))
        print("Valid computation!")
        return 1
    except Exception:
        if (verbose):
            return 0
        else:
            raise Exception("CPU and GPU results do not match!")


# sizes = 2**np.arange(4, 12).astype(np.int32)
sizes = np.array([1235, 257]).astype(np.int32)
num_exps = 1
for i in range(num_exps):
    TRANSPOSE = False
    for size in sizes:
        M, K, N = size, size, size
        print("*"*50)
        print(f"Testing M = {M}, K = {K}, N = {N}, transpose={TRANSPOSE}")
        valid = valid_check(M, K, N, fname='sharedMemKernel', shared=True,
                            verbose=True, transpose=TRANSPOSE)
        if (valid == 1):
            print("\033[32mTest passed!\033[0m")
            print("*"*50)
        elif (valid == 0):
            print("\033[31m.....FAILED!.....\033[0m")
            print("*"*50)
</code>
Additional device function:
<code>__device__ void loadIntoSharedMemoryRowPhase(float *Ashared, float *A, int rowTileIdx, int ArowSize, int AcolSize){
    /*
    Load data from input A into shared memory Ashared - of size (blockDim.y, blockDim.x)
    Assume that thread block size is same as Ashared size
    Phases move across rows of input matrix A
    */
    const unsigned int RowIdx = rowTileIdx*blockDim.y + threadIdx.y;
    const unsigned int ColIdx = blockIdx.x*blockDim.x + threadIdx.x;
    if(RowIdx<ArowSize && ColIdx<AcolSize) {
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = A[RowIdx*AcolSize + ColIdx];
    }
    else{
        Ashared[threadIdx.y*blockDim.x + threadIdx.x] = 0.0f;
    }
    // Debug outputs
    // if(blockIdx.x==0 && blockIdx.y==0 && threadIdx.x==0 && threadIdx.y==0){
    //     printf("loaded value(%f) from B[%d][%d]\n", A[RowIdx*AcolSize + ColIdx], RowIdx, ColIdx);
    // }
}
</code>