I was running the k-means algorithm using cuda and encountered a problem in this part of the code before for if (idx < numPoints) { atomicAdd(&counts[points[idx].cluster], 1);
code:
#include <iostream>
#include <vector>
#include <cmath>
#include <limits>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
using namespace std;
struct Point {
double* attributes;
int cluster;
};
__device__ double euclideanDistanceCUDA(const double* a, const double* b, int numAttributes) {
double sum = 0.0;
for (int i = 0; i < numAttributes; ++i) {
sum += pow(a[i] - b[i], 2);
}
return sqrt(sum);
}
__global__ void assignClustersCUDA(Point* points, double* attributes, double* centroids, int numPoints, int k, int numAttributes) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < numPoints) {
double minDistance = numeric_limits<double>::max();
int cluster = 0;
for (int i = 0; i < k; ++i) {
double distance = euclideanDistanceCUDA(&attributes[idx * numAttributes], ¢roids[i * numAttributes], numAttributes);
if (distance < minDistance) {
minDistance = distance;
cluster = i;
}
}
points[idx].cluster = cluster;
}
}
__global__ void updateCentroidsCUDA(Point* points, double* centroids, int* counts, int numPoints, int k, int numAttributes) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < numPoints) {
atomicAdd(&counts[points[idx].cluster], 1);
for (int j = 0; j < numAttributes; ++j) {
atomicAdd(¢roids[points[idx].cluster * numAttributes + j], points[idx].attributes[j]);
}
}
}
void initializeCentroids(double* centroids, int numAttributes) {
// Задаем фиксированные начальные позиции для центроидов, такие же, как в CPU-версии
centroids[0 * numAttributes + 0] = 1.0;
centroids[0 * numAttributes + 1] = 2.0;
centroids[0 * numAttributes + 2] = 3.0;
centroids[0 * numAttributes + 3] = 4.0;
centroids[1 * numAttributes + 0] = 4.0;
centroids[1 * numAttributes + 1] = 3.0;
centroids[1 * numAttributes + 2] = 2.0;
centroids[1 * numAttributes + 3] = 1.0;
centroids[2 * numAttributes + 0] = 5.0;
centroids[2 * numAttributes + 1] = 6.0;
centroids[2 * numAttributes + 2] = 7.0;
centroids[2 * numAttributes + 3] = 8.0;
centroids[3 * numAttributes + 0] = 8.0;
centroids[3 * numAttributes + 1] = 7.0;
centroids[3 * numAttributes + 2] = 6.0;
centroids[3 * numAttributes + 3] = 5.0;
centroids[4 * numAttributes + 0] = 2.5;
centroids[4 * numAttributes + 1] = 3.5;
centroids[4 * numAttributes + 2] = 4.5;
centroids[4 * numAttributes + 3] = 5.5;
}
int main() {
int k = 5;
int numPoints = 130000;
int numAttributes = 4;
vector<Point> points(numPoints);
double* attributes = new double[numPoints * numAttributes];
for (int i = 0; i < numPoints; ++i) {
points[i].attributes = &attributes[i * numAttributes];
for (int j = 0; j < numAttributes; ++j) {
points[i].attributes[j] = static_cast<double>(rand()) / RAND_MAX;
}
}
double* centroids = new double[k * numAttributes];
Point* d_points;
double* d_attributes;
double* d_centroids;
int* d_counts;
cudaMalloc(&d_points, numPoints * sizeof(Point));
cudaMalloc(&d_attributes, numPoints * numAttributes * sizeof(double));
cudaMalloc(&d_centroids, k * numAttributes * sizeof(double));
cudaMalloc(&d_counts, k * sizeof(int));
cudaMemcpy(d_attributes, attributes, numPoints * numAttributes * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_points, points.data(), numPoints * sizeof(Point), cudaMemcpyHostToDevice);
initializeCentroids(centroids, numAttributes);
cudaMemcpy(d_centroids, centroids, k * numAttributes * sizeof(double), cudaMemcpyHostToDevice);
int blockSize = 256;
int numBlocks = (numPoints + blockSize - 1) / blockSize;
clock_t start = clock();
bool changed;
do {
changed = false;
cudaMemset(d_counts, 0, k * sizeof(int));
cudaMemset(d_centroids, 0, k * numAttributes * sizeof(double));
assignClustersCUDA <<<numBlocks, blockSize >> > (d_points, d_attributes, d_centroids, numPoints, k, numAttributes);
cudaDeviceSynchronize();
updateCentroidsCUDA <<<numBlocks, blockSize >> > (d_points, d_attributes, d_centroids, d_counts, numPoints, k, numAttributes);
cudaDeviceSynchronize();
cudaMemcpy(centroids, d_centroids, k * numAttributes * sizeof(double), cudaMemcpyDeviceToHost);
vector<double> oldCentroids(centroids, centroids + k * numAttributes);
for (int i = 0; i < k; ++i) {
if (euclideanDistanceCUDA(&oldCentroids[i * numAttributes], ¢roids[i * numAttributes], numAttributes) > 1e-4) {
changed = true;
break;
}
}
} while (changed);
clock_t end = clock();
double duration = double(end - start) / CLOCKS_PER_SEC;
cudaFree(d_points);
cudaFree(d_attributes);
cudaFree(d_centroids);
cudaFree(d_counts);
delete[] attributes;
delete[] centroids;
cout << "Кластеризация завершена на GPU." << endl;
cout << "Время выполнения: " << duration << " секунд." << endl;
return 0;
}
I tried to find a directive that I think is missing for correct definition of an identifier, because I had a problem with blockIdx blockDim + threadIdx until I added the dirictive #include “device_launch_parameters.h”.I’m a student and I’d like to figure out what’s wrong with my code.
chaiz is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.