I’m new to CUDA and just doing some exercises to get myself started. The following program is adapted from a homework problem for matrix multiplication found here. I’m working on a Windows 10/x64 machine using
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:30:10_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0
My only CUDA device is
Device 0: "Quadro P620"
Total amount of global memory: 2048 MBytes (2147352576 bytes)
Number of multiprocessors: 4
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Problem. Depending on where I insert the macro cudaCheckErrors() and on the compilation mode, the program can get flagged by Windows Security as a threat. For example, the following program does not work:
#include <stdio.h>
// these are just for timing measurements
#include <time.h>
// error checking macro
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
const int DSIZE = 4096;
const int block_size = 16; // CUDA maximum is 1024 *total* threads in block
const float A_val = 1.0f;
const float B_val = 2.0f;
// matrix multiply (naive) kernel: C = A * B
__global__ void mmul(const float *A, const float *B, float *C, int ds) {
int idx = threadIdx.x+blockDim.x*blockIdx.x; // create thread x index
int idy = threadIdx.y+blockDim.y*blockIdx.y; // create thread y index
if ((idx < ds) && (idy < ds)){
float temp = 0;
for (int i = 0; i < ds; i++)
temp += A[idy * ds + i] * B[i * ds + idx]; // dot product of row and column
C[idy * ds + idx] = temp;
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
// these are just for timing
clock_t t0, t1, t2;
double t1sum=0.0;
double t2sum=0.0;
// start timing
t0 = clock();
h_A = new float[DSIZE * DSIZE];
h_B = new float[DSIZE * DSIZE];
h_C = new float[DSIZE * DSIZE];
for (int i = 0; i < DSIZE * DSIZE; i++){
h_A[i] = A_val;
h_B[i] = B_val;
h_C[i] = 0;
}
// Initialization timing
t1 = clock();
t1sum = ((double)(t1 - t0)) / CLOCKS_PER_SEC;
printf("Init took %f seconds. Begin computen", t1sum);
// Allocate device memory and copy input data over to GPU
cudaMalloc(&d_A, DSIZE * DSIZE * sizeof(float));
// cudaCheckErrors("cudaMalloc failure");
cudaMalloc(&d_B, DSIZE * DSIZE * sizeof(float));
cudaCheckErrors("cudaMalloc failure");
cudaMalloc(&d_C, DSIZE * DSIZE * sizeof(float));
// cudaCheckErrors("cudaMalloc failure");
cudaMemcpy(d_A, h_A, DSIZE * DSIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
cudaMemcpy(d_B, h_B, DSIZE * DSIZE * sizeof(float), cudaMemcpyHostToDevice);
// cudaCheckErrors("cudaMemcpy H2D failure");
// Launch kernel
dim3 block(block_size, block_size); // dim3 variable holds 3 dimensions
dim3 grid((DSIZE + block.x - 1) / block.x, (DSIZE + block.y - 1) / block.y);
mmul<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
// cudaCheckErrors("kernel launch failure");
// Synchronize device
cudaDeviceSynchronize();
// cudaCheckErrors("Device synchronization failure");
// Copy results back to host
cudaMemcpy(h_C, d_C, DSIZE * DSIZE * sizeof(float), cudaMemcpyDeviceToHost);
// cudaCheckErrors("cudaMemcpy D2H failure for d_C");
// free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// GPU timing
t2 = clock();
t2sum = ((double)(t2 - t1)) / CLOCKS_PER_SEC;
printf ("Done. Compute took %f secondsn", t2sum);
// Verify results
for (int i = 0; i < DSIZE * DSIZE; i++) {
if (h_C[i] != A_val * B_val * DSIZE) {
printf("Mismatch at index %d, was: %f, should be: %fn", i, h_C[i], A_val * B_val * DSIZE);
return -1;
}
}
printf("Success!n");
delete[] h_A;
delete[] h_B;
delete[] h_C;
return 0;
}
I tried to compile and run this program:
PS> nvcc -o mm ./matrix_mul.cu
PS> ./mm
and PowerShell throws the error
ResourceUnavailable: Program 'mm.exe' failed to run: An error occurred trying to start process 'mm.exe' with working directory '.'. Operation did not complete successfully because the file contains a virus or potentially unwanted software.
At line:1 char:1
+ ./mm
I also received a notice from Windows Security saying that a severe threat, Trojan:Win64/Disdroth.EM!MTB, was detected and quarantined. Notice that in this version some of the cudaCheckErrors() calls are commented out while the others are left in place.
Compile with -g and it works. However, if I build the program by
PS> nvcc -clean ./matrix_mul.cu
PS> nvcc -g -o mm ./matrix_mul.cu
then it runs without any problem:
PS> ./mm
Init took 0.070000 seconds. Begin compute
Done. Compute took 2.469000 seconds
Success!
Enable checks after cudaMalloc and cudaMemcpy and the program works. If I have cudaCheckErrors() after every call to cudaMalloc() or cudaMemcpy(), then the program compiled without -g works as well. To be precise, this is the modification I mean:
// Allocate device memory and copy input data over to GPU
cudaMalloc(&d_A, DSIZE * DSIZE * sizeof(float));
cudaCheckErrors("cudaMalloc failure");
cudaMalloc(&d_B, DSIZE * DSIZE * sizeof(float));
cudaCheckErrors("cudaMalloc failure");
cudaMalloc(&d_C, DSIZE * DSIZE * sizeof(float));
cudaCheckErrors("cudaMalloc failure");
cudaMemcpy(d_A, h_A, DSIZE * DSIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
cudaMemcpy(d_B, h_B, DSIZE * DSIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
// Launch kernel
dim3 block(block_size, block_size); // dim3 variable holds 3 dimensions
dim3 grid((DSIZE + block.x - 1) / block.x, (DSIZE + block.y - 1) / block.y);
mmul<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
// cudaCheckErrors("kernel launch failure");
// Synchronize device
cudaDeviceSynchronize();
// cudaCheckErrors("Device synchronization failure");
// Copy results back to host
cudaMemcpy(h_C, d_C, DSIZE * DSIZE * sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H failure for d_C");
// free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
Note that I still have some cudaCheckErrors() commented out (after the kernel launch and the device synchronization), and I don't have any following cudaFree(); a sketch with those remaining checks enabled follows the run output below. Again I compiled and ran the program:
PS> nvcc -o mm ./matrix_mul.cu
PS> ./mm
Init took 0.050000 seconds. Begin compute
Done. Compute took 2.496000 seconds
Success!
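For completeness, enabling those remaining checks would simply mean uncommenting the corresponding lines, so that both the launch itself and any error raised asynchronously during kernel execution get checked:

mmul<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaCheckErrors("kernel launch failure");          // catches launch configuration errors
cudaDeviceSynchronize();
cudaCheckErrors("Device synchronization failure"); // catches errors from the kernel run itself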
Remove all checks and the program also works. I also experimented with removing the checks altogether, and the program (compiled without -g) runs without a snag.
Question. I’m wondering whether this strange behavior is simply a false positive by Windows Security, or whether it’s due to incorrect use of the functions cudaGetLastError() and cudaGetErrorString() wrapped in the macro. I’m not familiar enough with the CUDA execution model to tell whether the latter is the case. At the same time, a quick search online shows that many CUDA users have seen their programs flagged by Windows as malware (here and here, for example).
I’d appreciate any comment that can get me on the right track in this CUDA journey. Thanks in advance!