I want to change the variable M,N by the argv parameters when the code is executed.My code[tvm_test.cu] is below:
#include <cuda_fp16.h>
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <assert.h>
#define CUDA_CHECK(status)
{
cudaError_t error = status;
if (error != cudaSuccess)
{
std::cerr << "Got bad cuda status: " << cudaGetErrorString(error)
<< " at line: " << __LINE__ << std::endl;
exit(EXIT_FAILURE);
}
}
__device__ int M=8192,N=8192;
const int tx = 1024, ty = 1, tz = 1;
const int element_per_thread = 256;
// const int M = 2048, N = 2048;
__global__
void int2half_I2H(int8_t* __restrict__ input, half* __restrict__ output){
#pragma unroll
for(int i = 0; i < 256; ++ i){
output[(blockIdx.x * 1024 + threadIdx.x) * 256 + i] = __int2half_rn(input[(blockIdx.x * 1024 + threadIdx.x) * 256 + i]);
}
}
__global__
void int2half_I2H_tvm(int8_t* input, half* output, int M, int N){
#pragma unroll
for(int i = 0; i < 256; ++ i){
output[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256] =
__int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (M * N) / 256]);
}
}
__global__
void int2half_I2H_tvm(int8_t* input, half* output){
int rM = 8192;
int rN = 8192;
// #pragma unroll
for(int i = 0; i < 256; ++ i){
output[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256] =
__int2half_rn(input[blockIdx.x * 1024 + threadIdx.x + i * (rM * rN) / 256]);
}
}
int main(int argc, char *argv[]){
M=atoi(argv[1]);
N=atoi(argv[2]);
srand(time(NULL));
// for int2half
int8_t *input = (int8_t *)malloc(M * N);
half *output_int2half = (half *)malloc(M * N * 2);
// for prmt, + 128
int8_t *input_preprocess = (int8_t *)malloc(M * N);
half *output_prmt = (half *)malloc(M * N * 2);
// result
float *golden = (float *)malloc(M * N * 4);
// rand input [-10, 10]
for(int i = 0; i < M * N; ++ i){
input[i] = (int8_t)((rand() % 21) - 10);
golden[i] = (float)input[i];
}
// preprocess + 128
for(int i = 0; i < M * N; ++ i){
input_preprocess[i] = input[i] + 128;
}
int8_t *d_input;
int8_t *d_input_preprocess;
half *d_output_int2half;
half *d_output_prmt;
CUDA_CHECK(cudaMalloc(&d_input, M * N));
CUDA_CHECK(cudaMalloc(&d_input_preprocess, M * N));
CUDA_CHECK(cudaMalloc(&d_output_int2half, M * N * 2));
CUDA_CHECK(cudaMalloc(&d_output_prmt, M * N * 2));
CUDA_CHECK(cudaMemcpy(d_input, input, M * N, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_input_preprocess, input_preprocess, M * N, cudaMemcpyHostToDevice));
dim3 dimBlock(1024);
dim3 dimGrid(256);
for(int i = 0; i < 1000; ++ i){
// printf("%d %d", M, N);
//int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half);
int2half_I2H_tvm<<<dimGrid, dimBlock>>> (d_input, d_output_int2half,M,N);
cudaDeviceSynchronize();
}
CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(output_int2half, d_output_int2half, M * N * 2, cudaMemcpyDeviceToHost));
}
But after the complie instruction “nvcc -o tvm_test tvm_test.cu”,we run “./tvm_test 8192 8192″. The error: Got bad cuda status: an illegal memory access was encountered at line: 104. the line code is ” CUDA_CHECK(cudaMemcpy(output_prmt, d_output_prmt, M * N * 2, cudaMemcpyDeviceToHost));”
I need your help! Please tell me ! Thank you very much!
we use the “int2half_I2H” function instead of the “int2half_I2H_tvm” function. there is no error!
hy pan is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.