I am trying to measure cache misses in a multi-threaded application using perf_event_open on Linux. The code involves two threads: thread t1 updates an aligned integer (x), and thread t2 reads from it. However, I am getting cache misses when t1 accesses x, even though x is updated only by t1.
Here is the relevant code snippet:
#include <bits/stdc++.h>
#include <chrono>
#include <linux/perf_event.h> /* Definition of PERF_* constants */
#include <linux/hw_breakpoint.h> /* Definition of HW_* constants */
#include <sys/syscall.h> /* Definition of SYS_* constants */
#include <unistd.h>
#include <cstring>
#include <atomic>
#include <sys/ioctl.h>
using namespace std;
/// Wrapper around a single int whose storage is aligned to 1024 bytes.
/// The alignment requirement on the member carries over to the struct
/// itself, so alignof(int_aligned) is 1024.
struct int_aligned {
    alignas(1024) int value;  // align to 1024 bytes, to avoid false sharing

    /// Constructs the wrapper holding `val`. Marked explicit so that a
    /// plain int is never silently converted into an int_aligned.
    explicit int_aligned(int val) : value(val) {}
};
// Shared value: read and incremented by writer(), read by reader().
// Allocated in main().
int_aligned* x;
// Destination into which reader() copies the value it reads from x.
// Allocated in main().
int_aligned* y;
// Hand-off flag between the two threads: writer() sets it to true after
// updating x, reader() sets it back to false after copying x into y.
atomic<bool> changed;
// Number of hand-offs each thread performs; set in main() from argv[1].
int loop_count;
// One perf counter reading per writer() iteration; printed by main() after
// both threads have been joined.
vector<int> load_time;
/**
 * Invokes the perf_event_open system call through syscall().
 *
 * @param hw_event  Attributes describing the event to open.
 * @param pid       Forwarded unchanged as the syscall's pid argument.
 * @param cpu       Forwarded unchanged as the syscall's cpu argument.
 * @param group_fd  Forwarded unchanged as the syscall's group_fd argument.
 * @param flags     Forwarded unchanged as the syscall's flags argument.
 * @return The file descriptor returned by the kernel. This function never
 *         returns -1: on failure it reports errno via perror() and
 *         terminates the process with EXIT_FAILURE.
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    // Keep the result as long (syscall()'s return type) rather than
    // narrowing it through an int before the error check.
    const long fd = syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    if (fd == -1) {
        // Name the failing call so the errno text has context.
        perror("perf_event_open");
        exit(EXIT_FAILURE);
    }
    return fd;
}
/**
 * Sorts the samples in place and prints summary statistics to stdout:
 * the minimum, the maximum, and the 10th..90th percentiles in steps of 10,
 * laid out as two rows of width-10 columns (percentile labels, then values).
 *
 * @param numbers        Samples to summarise; sorted ascending as a side effect.
 * @param element_count  Number of leading (post-sort) samples to consider.
 *                       Values larger than numbers.size() are clamped; if the
 *                       effective count is zero a short notice is printed and
 *                       nothing else is done.
 */
void print_performance_metrics(std::vector<int> &numbers, int element_count) {
    // Clamp so that neither an empty vector nor an oversized count can index
    // out of bounds.
    const std::size_t sample_count =
        element_count > 0
            ? std::min(static_cast<std::size_t>(element_count), numbers.size())
            : 0;
    if (sample_count == 0) {
        std::cout << "No samples to report." << '\n';
        return;
    }

    std::sort(numbers.begin(), numbers.end());
    std::cout << "Min: " << numbers.front() << '\n';
    std::cout << "Max: " << numbers[sample_count - 1] << '\n';

    constexpr std::array<int, 9> percentile_req{10, 20, 30, 40, 50, 60, 70, 80, 90};
    std::array<int, percentile_req.size()> percentile_values{};
    for (std::size_t i = 0; i < percentile_req.size(); ++i) {
        // Index arithmetic in size_t: percentile * count cannot overflow int.
        const std::size_t index =
            (static_cast<std::size_t>(percentile_req[i]) * sample_count) / 100;
        percentile_values[i] = numbers[index];
    }

    std::cout << "Percentile values are: " << '\n';
    for (const int percentile : percentile_req) {
        std::cout << std::setw(10) << percentile;
    }
    std::cout << "\n";
    for (const int value : percentile_values) {
        std::cout << std::setw(10) << value;
    }
    std::cout << "\n";
}
/**
 * Writer thread body.
 *
 * Opens a perf counter configured for L1 data-cache read misses (user space
 * only: kernel and hypervisor are excluded) on the calling thread, then
 * performs loop_count hand-offs with reader(). On each hand-off it waits for
 * `changed` to be false, loads x->value with the counter reset and enabled
 * around that single load, increments the value, stores it back, records the
 * counter reading in load_time, and sets `changed` to true.
 *
 * Terminates the process with EXIT_FAILURE if the counter cannot be opened or
 * read. The counter file descriptor is closed before returning.
 */
void writer() {
    int cnt = 0;
    long long count = 0;

    struct perf_event_attr pe;
    std::memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HW_CACHE;
    pe.size = sizeof(pe);
    // cache id | (operation << 8) | (result << 16): L1D, read, miss.
    pe.config = (PERF_COUNT_HW_CACHE_L1D) |
                (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;

    const int fd = static_cast<int>(perf_event_open(&pe, 0, -1, -1, 0));
    if (fd == -1) {
        std::fprintf(stderr, "Error opening leader %llx\n",
                     static_cast<unsigned long long>(pe.config));
        std::cout << "error opening leader\n" << std::endl;
        std::exit(EXIT_FAILURE);
    }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    while (cnt < loop_count) {
        if (!changed.load()) {
            auto x_ptr = x;
            // Measurement window: zero and enable the counter immediately
            // before the load, disable it immediately after.
            // NOTE(review): the window presumably also covers the user-space
            // tail of the ENABLE ioctl and the head of the DISABLE ioctl, so
            // the reading may not be attributable to this load alone - confirm.
            ioctl(fd, PERF_EVENT_IOC_RESET, 0);
            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            auto x_copy = x_ptr->value;  // getting cache miss here
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            x_copy++;
            x->value = x_copy;
            // A short or failed read would leave `count` stale; stop rather
            // than record a bogus sample.
            if (read(fd, &count, sizeof(count)) != static_cast<ssize_t>(sizeof(count))) {
                perror("read(perf counter)");
                close(fd);
                std::exit(EXIT_FAILURE);
            }
            load_time.push_back(static_cast<int>(count));
            cnt++;
            changed.store(true);
        }
    }

    // Release the perf event; the original leaked this descriptor.
    close(fd);
}
/**
 * Reader thread body.
 *
 * Performs loop_count hand-offs: busy-waits until `changed` is true, copies
 * x->value into y->value, then sets `changed` back to false.
 */
void reader() {
    for (int handled = 0; handled < loop_count;) {
        if (!changed.load()) {
            continue;  // nothing new yet; keep spinning
        }
        const auto snapshot = x->value;
        y->value = snapshot;
        ++handled;
        changed.store(false);
    }
}
int main(int argc, char** argv) {
x = new int_aligned(0);
y = new int_aligned(0);
changed.store(false);
loop_count = atoi(argv[1]);
thread t1(writer);
thread t2(reader);
t1.join();
t2.join();
cout << "x: " << x->value << endl;
cout << "y: " << y->value << endl;
print_performance_metrics(load_time, load_time.size());
return 0;
}
I compile the code with the following command:
g++ <filename> -o <out> -std=c++17 -pthread -O2
And I run it with appropriate permissions for perf:
echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid
The issue is that I am consistently getting cache misses when loading x in the writer thread, even though x is aligned to 1024 bytes and should not suffer from false sharing. x is only updated by the writer thread, so I don’t understand why there are cache misses.
Any insights into why this is happening and how I can resolve it would be greatly appreciated.
Rishi Jain is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.