Thiết kế website giá rẻ

Question

I want to change the variable M,N by the argv parameters when the code is executed.My code[tvm_test.cu] is below:

<code>#include <cuda_fp16.h>

#include <iostream>

#include <cuda_runtime.h>

#include <chrono>

#include <assert.h>

#define CUDA_CHECK(status)

{

cudaError_t error = status;

if (error != cudaSuccess)

{

std::cerr << "Got bad cuda status: " << cudaGetErrorString(error)

<< " at line: " << __LINE__ << std::endl;

exit(EXIT_FAILURE);

}

__device__ int M=8192,N=8192;

const int tx = 1024, ty = 1, tz = 1;

const int element_per_thread = 256;

// const int M = 2048, N = 2048;

__global__

void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){

#pragma unroll

for(int i = 0; i < 256; ++ i){

output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]);

}

__global__

void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){

#pragma unroll

for(int i = 0; i < 256; ++ i){

output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] =

__int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]);

}

__global__

void int2half_I2H_tvm(int8_t* input, half* output){

int rM = 8192;

int rN = 8192;

// #pragma unroll

for(int i = 0; i < 256; ++ i){

output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] =

__int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]);

}

int main(int argc, char *argv[]){

M=atoi(argv[1]);

N=atoi(argv[2]);

srand(time(NULL));

// for int2half

int8_t *input = (int8_t *)malloc(M * N);

half *output_int2half = (half *)malloc(M * N * 2);

// for prmt, + 128

int8_t *input_preprocess = (int8_t *)malloc(M * N);

half *output_prmt = (half *)malloc(M * N * 2);

// result

float *golden = (float *)malloc(M * N * 4);

// rand input [-10, 10]

for(int i = 0; i < M * N; ++ i){

input[i] = (int8_t)((rand() % 21) - 10);

golden[i] = (float)input[i];

}

// preprocess + 128

for(int i = 0; i < M * N; ++ i){

input_preprocess[i] = input[i] + 128;

}

int8_t *d_input;

int8_t *d_input_preprocess;

half *d_output_int2half;

half *d_output_prmt;

CUDA_CHECK(cudaMalloc(&d_input, M * N));

CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N));

CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2));

CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2));

CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice));

CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice));

dim3 dimBlock(1024);

dim3 dimGrid(256);

for(int i = 0; i < 1000; ++ i){

// printf("%d %d", M, N);

//int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half);

int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N);

cudaDeviceSynchronize();

}

CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));

CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost));

}

</code>

<code>#include <cuda_fp16.h> #include <iostream> #include <cuda_runtime.h> #include <chrono> #include <assert.h> #define CUDA_CHECK(status) { cudaError_t error = status; if (error != cudaSuccess) { std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) << " at line: " << __LINE__ << std::endl; exit(EXIT_FAILURE); } } __device__ int M=8192,N=8192; const int tx = 1024, ty = 1, tz = 1; const int element_per_thread = 256; // const int M = 2048, N = 2048; __global__ void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){ #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] = __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]); } } __global__ void int2half_I2H_tvm(int8_t* input, half* output){ int rM = 8192; int rN = 8192; // #pragma unroll for(int i = 0; i < 256; ++ i){ output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] = __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]); } } int main(int argc, char *argv[]){ M=atoi(argv[1]); N=atoi(argv[2]); srand(time(NULL)); // for int2half int8_t *input = (int8_t *)malloc(M * N); half *output_int2half = (half *)malloc(M * N * 2); // for prmt, + 128 int8_t *input_preprocess = (int8_t *)malloc(M * N); half *output_prmt = (half *)malloc(M * N * 2); // result float *golden = (float *)malloc(M * N * 4); // rand input [-10, 10] for(int i = 0; i < M * N; ++ i){ input[i] = (int8_t)((rand() % 21) - 10); golden[i] = (float)input[i]; } // preprocess + 128 for(int i = 0; i < M * N; ++ i){ input_preprocess[i] = input[i] + 128; } int8_t *d_input; int8_t *d_input_preprocess; half *d_output_int2half; half *d_output_prmt; CUDA_CHECK(cudaMalloc(&d_input, M * N)); CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N)); CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2)); CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2)); CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice)); dim3 dimBlock(1024); dim3 dimGrid(256); for(int i = 0; i < 1000; ++ i){ // printf("%d %d", M, N); //int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half); int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N); cudaDeviceSynchronize(); } CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost)); } </code>

#include <cuda_fp16.h>
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <assert.h>

#define CUDA_CHECK(status)                                                    
    {                                                                         
        cudaError_t error = status;                                           
        if (error != cudaSuccess)                                             
        {                                                                     
            std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) 
                      << " at line: " << __LINE__ << std::endl;               
            exit(EXIT_FAILURE);                                               
        }                                                                     
    }

__device__ int M=8192,N=8192;
const int tx = 1024, ty = 1, tz = 1;
const int element_per_thread = 256;
// const int M = 2048, N = 2048;


__global__
void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){
    #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]);
    }
}

__global__
void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){
    #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] = 
            __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]);
    }
}

__global__
void int2half_I2H_tvm(int8_t* input, half* output){
    int rM = 8192;
    int rN = 8192;
    // #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] = 
            __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]);
    }
}

int main(int argc, char *argv[]){
    M=atoi(argv[1]); 
    N=atoi(argv[2]); 
    srand(time(NULL));
    
    // for int2half
    int8_t *input = (int8_t *)malloc(M * N); 
    half *output_int2half = (half *)malloc(M * N * 2);

    // for prmt, + 128
    int8_t *input_preprocess = (int8_t *)malloc(M * N); 
    half *output_prmt = (half *)malloc(M * N * 2);

    // result
    float *golden = (float *)malloc(M * N * 4);

    // rand input [-10, 10]
    for(int i = 0; i < M * N; ++ i){
        input[i] = (int8_t)((rand() % 21) - 10);
        golden[i] = (float)input[i];
    }
    
    // preprocess + 128
    for(int i = 0; i < M * N; ++ i){
        input_preprocess[i] = input[i] + 128;
    }
    
    int8_t *d_input;
    int8_t *d_input_preprocess;
    half *d_output_int2half;
    half *d_output_prmt;

    CUDA_CHECK(cudaMalloc(&d_input, M * N));
    CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N));
    CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2));
    CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2));

    CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice));

   
    
    dim3 dimBlock(1024);
    dim3 dimGrid(256);
    for(int i = 0; i < 1000; ++ i){
        // printf("%d %d", M, N);
        //int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half);
        int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N);
        cudaDeviceSynchronize();
    }
    

    CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost));

   
}

But after the complie instruction “nvcc -o tvm_test tvm_test.cu”,we run “./tvm_test 8192 8192″. The error: Got bad cuda status: an illegal memory access was encountered at line: 104. the line code is ” CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));”

I need your help! Please tell me ! Thank you very much!

we use the “int2half_I2H” function instead of the “int2half_I2H_tvm” function. there is no error!

Thiết kế website giá rẻ

Danh mục

CUDA cudaMemcpy an illegal memory access was encountered