This is my first time using shared memory.
I use tmp_
for storing local computing result local_
in the kernel.
After I check each step of computing result, I noticed that these two values are different.
local_
is my expect answer:
k_n: 0, tmp_[k_n]: ( 0.736924, 0.656535 )
k_n: 0, local_: ( 0.713449, 0.655452 )
k_n: 1, tmp_[k_n]: ( -0.196119, -0.062858 )
k_n: 1, local_: ( -0.209227, -0.039096 )
k_n: 2, tmp_[k_n]: ( -0.446193, -0.209284 )
k_n: 2, local_: ( -0.425918, -0.208783 )
k_n: 3, tmp_[k_n]: ( -0.258998, 0.294438 )
k_n: 3, local_: ( -0.273593, 0.264612 )
The way I call the kernel in host:
<code>size_t sharedMemSize = sizeof(cuDoubleComplex) * cfg.m * cfg.n;
first_scanning<<<gridNum_, blockNum_, sharedMemSize>>>(thrust::raw_pointer_cast(d_m_steering_vec.data()), noise_cov.data(), &cfg);
<code>size_t sharedMemSize = sizeof(cuDoubleComplex) * cfg.m * cfg.n;
first_scanning<<<gridNum_, blockNum_, sharedMemSize>>>(thrust::raw_pointer_cast(d_m_steering_vec.data()), noise_cov.data(), &cfg);
</code>
size_t sharedMemSize = sizeof(cuDoubleComplex) * cfg.m * cfg.n;
first_scanning<<<gridNum_, blockNum_, sharedMemSize>>>(thrust::raw_pointer_cast(d_m_steering_vec.data()), noise_cov.data(), &cfg);
Kernel function:
<code>__global__ void first_scanning(cuDoubleComplex **p_steering_v, cuDoubleComplex *noiseSpace, DOAConfig *cfg) {
double lambda = cfg->lambda;
extern __shared__ cuDoubleComplex tmp_[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int spectrum_m = cfg->FovAzimuth / cfg->scale + 1;
int spectrum_n = cfg->FovElevation / cfg->scale + 1;
if (i < spectrum_m && j < spectrum_n) {
int idx = i * spectrum_n + j;
double azimuth = (-cfg->FovAzimuth/2 + i * cfg->scale) * M_PI / 180.0;
double elevation = (-cfg->FovAzimuth/2 + j * cfg->scale) * M_PI / 180.0;
double sin_theta = sin(elevation);
double cos_theta = cos(elevation);
double sin_phi = sin(azimuth);
double cos_phi = cos(azimuth);
for (int ant_m = 0; ant_m < m; ++ant_m) {
for (int ant_n = 0; ant_n < n; ++ant_n) {
int ant_idx = ant_m * n + ant_n;
double phaseShift = 2 * M_PI * d *
(0 * (cos_phi * sin_theta) + ant_m * (sin_phi * sin_theta) + ant_n * cos_theta) /
cuDoubleComplex exp_j_phaseShift = make_cuDoubleComplex(cos(phaseShift), sin(phaseShift));
p_steering_v[idx][ant_idx] = exp_j_phaseShift;
for (int k_n = 0 ; k_n < m*n ; ++k_n) {
cuDoubleComplex local_ = {0, 0};
for (int k_m = 0 ; k_m < m*n ; ++k_m) {
int noise_idx = k_n + k_m * m * n;
local_ = cuCadd(local_, cuCmul(cuConj(p_steering_v[idx][k_m]), noiseSpace[noise_idx]));
if (i == 50 && j == 50) {
printf("k_n: %d, tmp_[k_n]: ( %f, %f )n", k_n, tmp_[k_n].x, tmp_[k_n].y);
printf("k_n: %d, local_: ( %f, %f )n", k_n, local_.x, local_.y);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error in initialize_steering_vector kernel launch: %sn", cudaGetErrorString(err));
<code>__global__ void first_scanning(cuDoubleComplex **p_steering_v, cuDoubleComplex *noiseSpace, DOAConfig *cfg) {
int m = cfg->m;
int n = cfg->n;
double d = cfg->d;
double lambda = cfg->lambda;
extern __shared__ cuDoubleComplex tmp_[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int spectrum_m = cfg->FovAzimuth / cfg->scale + 1;
int spectrum_n = cfg->FovElevation / cfg->scale + 1;
if (i < spectrum_m && j < spectrum_n) {
int idx = i * spectrum_n + j;
double azimuth = (-cfg->FovAzimuth/2 + i * cfg->scale) * M_PI / 180.0;
double elevation = (-cfg->FovAzimuth/2 + j * cfg->scale) * M_PI / 180.0;
double sin_theta = sin(elevation);
double cos_theta = cos(elevation);
double sin_phi = sin(azimuth);
double cos_phi = cos(azimuth);
for (int ant_m = 0; ant_m < m; ++ant_m) {
for (int ant_n = 0; ant_n < n; ++ant_n) {
int ant_idx = ant_m * n + ant_n;
double phaseShift = 2 * M_PI * d *
(0 * (cos_phi * sin_theta) + ant_m * (sin_phi * sin_theta) + ant_n * cos_theta) /
lambda;
cuDoubleComplex exp_j_phaseShift = make_cuDoubleComplex(cos(phaseShift), sin(phaseShift));
p_steering_v[idx][ant_idx] = exp_j_phaseShift;
}
}
for (int k_n = 0 ; k_n < m*n ; ++k_n) {
cuDoubleComplex local_ = {0, 0};
for (int k_m = 0 ; k_m < m*n ; ++k_m) {
int noise_idx = k_n + k_m * m * n;
local_ = cuCadd(local_, cuCmul(cuConj(p_steering_v[idx][k_m]), noiseSpace[noise_idx]));
}
__syncthreads();
tmp_[k_n] = local_;
__syncthreads();
if (i == 50 && j == 50) {
printf("k_n: %d, tmp_[k_n]: ( %f, %f )n", k_n, tmp_[k_n].x, tmp_[k_n].y);
printf("k_n: %d, local_: ( %f, %f )n", k_n, local_.x, local_.y);
}
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error in initialize_steering_vector kernel launch: %sn", cudaGetErrorString(err));
}
}
}
</code>
__global__ void first_scanning(cuDoubleComplex **p_steering_v, cuDoubleComplex *noiseSpace, DOAConfig *cfg) {
int m = cfg->m;
int n = cfg->n;
double d = cfg->d;
double lambda = cfg->lambda;
extern __shared__ cuDoubleComplex tmp_[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int spectrum_m = cfg->FovAzimuth / cfg->scale + 1;
int spectrum_n = cfg->FovElevation / cfg->scale + 1;
if (i < spectrum_m && j < spectrum_n) {
int idx = i * spectrum_n + j;
double azimuth = (-cfg->FovAzimuth/2 + i * cfg->scale) * M_PI / 180.0;
double elevation = (-cfg->FovAzimuth/2 + j * cfg->scale) * M_PI / 180.0;
double sin_theta = sin(elevation);
double cos_theta = cos(elevation);
double sin_phi = sin(azimuth);
double cos_phi = cos(azimuth);
for (int ant_m = 0; ant_m < m; ++ant_m) {
for (int ant_n = 0; ant_n < n; ++ant_n) {
int ant_idx = ant_m * n + ant_n;
double phaseShift = 2 * M_PI * d *
(0 * (cos_phi * sin_theta) + ant_m * (sin_phi * sin_theta) + ant_n * cos_theta) /
lambda;
cuDoubleComplex exp_j_phaseShift = make_cuDoubleComplex(cos(phaseShift), sin(phaseShift));
p_steering_v[idx][ant_idx] = exp_j_phaseShift;
}
}
for (int k_n = 0 ; k_n < m*n ; ++k_n) {
cuDoubleComplex local_ = {0, 0};
for (int k_m = 0 ; k_m < m*n ; ++k_m) {
int noise_idx = k_n + k_m * m * n;
local_ = cuCadd(local_, cuCmul(cuConj(p_steering_v[idx][k_m]), noiseSpace[noise_idx]));
}
__syncthreads();
tmp_[k_n] = local_;
__syncthreads();
if (i == 50 && j == 50) {
printf("k_n: %d, tmp_[k_n]: ( %f, %f )n", k_n, tmp_[k_n].x, tmp_[k_n].y);
printf("k_n: %d, local_: ( %f, %f )n", k_n, local_.x, local_.y);
}
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error in initialize_steering_vector kernel launch: %sn", cudaGetErrorString(err));
}
}
}
Any comments on the way I use shared memory?
Thanks!