I use CUDA and CUSPARSE to code COO format SpMV, When I test SpMV run time, I find when I use small matrix it can print right run time, but use bigger matrix it just print 0.
int *d_rows, *d_columns;
float *d_values, *dX, *dY;
CHECK_CUDA(cudaMalloc((void**) &d_rows, nnz * sizeof(int)) )
CHECK_CUDA(cudaMalloc((void**) &d_columns, nnz * sizeof(int)) )
CHECK_CUDA(cudaMalloc((void**) &d_values, nnz * sizeof(float)) )
CHECK_CUDA(cudaMalloc((void**) &dX, num_cols * sizeof(float)) )
CHECK_CUDA(cudaMalloc((void**) &dY, num_rows * sizeof(float)) )
CHECK_CUDA(cudaMemcpy(d_rows, row_indices.data(), nnz * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA(cudaMemcpy(d_columns,col_indices.data(), nnz * sizeof(int),
cudaMemcpyHostToDevice) )
CHECK_CUDA(cudaMemcpy(d_values, values.data(), nnz * sizeof(float),
cudaMemcpyHostToDevice) )
CHECK_CUDA(cudaMemcpy(dX, hX.data(), num_cols * sizeof(float),
cudaMemcpyHostToDevice) )
I used “cudaMalloc” and “cudaMemcpy” to manage my device data, and when I check the time, I found “cusparseCreateCoo” function can successfully print time while “cusparseSpMV” cannot.
cusparseCreateCoo(&matA, num_rows, num_cols, nnz,
d_rows, d_columns, d_values,
CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, num_cols, dX, CUDA_R_32F) )
CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, num_rows, dY, CUDA_R_32F) )
CHECK_CUSPARSE(cusparseSpMV_bufferSize(
handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize) )
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, dBuffer) )
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float s = 0;
cudaEventElapsedTime(&s, start, stop);
printf("Time taken for SpMV: %lf msn" ,s);
The wrong print likes: Time taken for SpMV: 0.000000 ms
CUDA API failed at line 232 with error: an illegal memory access was encountered (700)
line 232 is:
CHECK_CUDA(cudaFree(d_rows) )
CHECK_CUDA(cudaFree(d_columns) )
CHECK_CUDA(cudaFree(d_values) )
CHECK_CUDA(cudaFree(dX) )
CHECK_CUDA(cudaFree(dY) )
So I thought the reason may was GPU memory.
And I changed my mem-function to “cudaMallocManaged”, then I test my code and print every array.
The array data is correct but test time also is 0.
I’d want to know the wrong place in my code
lingr is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.