Thiết kế website giá rẻ

Question

I want to change the variable M,N by the argv parameters when the code is executed.My code[tvm_test.cu] is below:

#include <cuda_fp16.h>
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <assert.h>

#define CUDA_CHECK(status)                                                    
    {                                                                         
        cudaError_t error = status;                                           
        if (error != cudaSuccess)                                             
        {                                                                     
            std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) 
                      << " at line: " << __LINE__ << std::endl;               
            exit(EXIT_FAILURE);                                               
        }                                                                     
    }

__device__ int M=8192,N=8192;
const int tx = 1024, ty = 1, tz = 1;
const int element_per_thread = 256;
// const int M = 2048, N = 2048;


__global__
void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){
    #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]);
    }
}

__global__
void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){
    #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] = 
            __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]);
    }
}

__global__
void int2half_I2H_tvm(int8_t* input, half* output){
    int rM = 8192;
    int rN = 8192;
    // #pragma unroll
    for(int i = 0; i < 256; ++ i){
        output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] = 
            __int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]);
    }
}

int main(int argc, char *argv[]){
    M=atoi(argv[1]); 
    N=atoi(argv[2]); 
    srand(time(NULL));
    
    // for int2half
    int8_t *input = (int8_t *)malloc(M * N); 
    half *output_int2half = (half *)malloc(M * N * 2);

    // for prmt, + 128
    int8_t *input_preprocess = (int8_t *)malloc(M * N); 
    half *output_prmt = (half *)malloc(M * N * 2);

    // result
    float *golden = (float *)malloc(M * N * 4);

    // rand input [-10, 10]
    for(int i = 0; i < M * N; ++ i){
        input[i] = (int8_t)((rand() % 21) - 10);
        golden[i] = (float)input[i];
    }
    
    // preprocess + 128
    for(int i = 0; i < M * N; ++ i){
        input_preprocess[i] = input[i] + 128;
    }
    
    int8_t *d_input;
    int8_t *d_input_preprocess;
    half *d_output_int2half;
    half *d_output_prmt;

    CUDA_CHECK(cudaMalloc(&d_input, M * N));
    CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N));
    CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2));
    CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2));

    CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice));

   
    
    dim3 dimBlock(1024);
    dim3 dimGrid(256);
    for(int i = 0; i < 1000; ++ i){
        // printf("%d %d", M, N);
        //int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half);
        int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N);
        cudaDeviceSynchronize();
    }
    

    CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost));

   
}

But after the complie instruction “nvcc -o tvm_test tvm_test.cu”,we run “./tvm_test 8192 8192″. The error: Got bad cuda status: an illegal memory access was encountered at line: 104. the line code is ” CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));”

I need your help! Please tell me ! Thank you very much!

we use the “int2half_I2H” function instead of the “int2half_I2H_tvm” function. there is no error!

Thiết kế website giá rẻ

Danh mục

CUDA cudaMemcpy an illegal memory access was encountered