Thiết kế website giá rẻ

Question

I am trying to measure cache misses in a multi-threaded application using perf_event_open on Linux. The code involves two threads: thread t1 updates an aligned integer (x), and thread t2 reads from it. However, I am getting cache misses when t1 accesses x, even though x is updated only by t1.

Here is the relevant code snippet:

<code>#include <bits/stdc++.h>

#include <chrono>

#include <linux/perf_event.h> /* Definition of PERF_* constants */

#include <linux/hw_breakpoint.h> /* Definition of HW_* constants */

#include <sys/syscall.h> /* Definition of SYS_* constants */

#include <unistd.h>

#include <cstring>

#include <atomic>

#include <sys/ioctl.h>

using namespace std;

// Integer wrapper whose storage is over-aligned so that two instances never
// share a cache line. alignas(1024) on the member raises the alignment (and
// therefore the size) of the whole struct to 1024 bytes.
struct int_aligned {
    alignas(1024) int value; // align to 1024 bytes, to avoid false sharing

    // explicit: a plain int must not silently convert into a 1 KiB object.
    explicit int_aligned(int val) : value(val) {}
};

// Shared state of the writer/reader experiment.

// Allocated in main(); incremented by writer() and read by reader().
int_aligned* x;

// Allocated in main(); reader() stores its copy of x->value here.
int_aligned* y;

// Handoff flag: writer() sets it to true after updating x, reader() resets it
// to false after copying x into y.
atomic<bool> changed;

// Number of iterations each thread performs; set from argv[1] in main().
int loop_count;

// One perf counter reading per writer() iteration; summarised by main() after
// both threads have been joined.
vector<int> load_time;

// Wrapper around the raw perf_event_open system call (glibc exposes no
// function for it). Returns the event file descriptor on success; on failure
// it reports errno through perror() and terminates the process, so callers
// never observe -1.
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    const int descriptor = syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    if (descriptor != -1) {
        return descriptor;
    }
    perror("");
    exit(EXIT_FAILURE);
}

// Sorts the samples in place and prints their minimum, maximum and the
// 10th..90th percentiles (in steps of 10) to standard output.
//
// numbers        samples; sorted ascending by this call.
// element_count  number of leading samples (after sorting) to report on;
//                values larger than numbers.size() are clamped.
//
// Prints a short notice and returns when there is nothing to report.
void print_performance_metrics(std::vector<int> &numbers, int element_count) {
    // Indexing an empty vector is undefined behaviour, so bail out early.
    if (element_count <= 0 || numbers.empty()) {
        std::cout << "No samples to report" << std::endl;
        return;
    }
    const std::size_t sample_count =
        std::min(static_cast<std::size_t>(element_count), numbers.size());

    std::sort(numbers.begin(), numbers.end());
    std::cout << "Min: " << numbers.front() << std::endl;
    std::cout << "Max: " << numbers[sample_count - 1] << std::endl;

    constexpr std::array<int, 9> percentile_req{10, 20, 30, 40, 50, 60, 70, 80, 90};

    std::cout << "Percentile values are: " << std::endl;
    for (const int percentile : percentile_req) {
        std::cout << std::setw(10) << percentile;
    }
    std::cout << "\n";

    for (const int percentile : percentile_req) {
        // size_t arithmetic: percentile * count cannot overflow like int would.
        // percentile <= 90, so index is always < sample_count.
        const std::size_t index = static_cast<std::size_t>(percentile) * sample_count / 100;
        std::cout << std::setw(10) << numbers[index];
    }
    std::cout << "\n";
}

// Writer thread: increments x->value loop_count times, handing each new value
// to reader() through the `changed` flag, and records the L1D read-miss
// counter value observed around each load of x->value.
//
// The event is opened with pid 0 / cpu -1 and excludes kernel and hypervisor
// events. One sample per iteration is appended to load_time.
void writer() {
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HW_CACHE;
    pe.size = sizeof(pe);
    pe.config = (PERF_COUNT_HW_CACHE_L1D) |
                (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;

    const int fd = static_cast<int>(perf_event_open(&pe, 0, -1, -1, 0));
    if (fd == -1) {
        fprintf(stderr, "Error opening leader %llx\n", static_cast<unsigned long long>(pe.config));
        std::cout << "error opening leader\n" << std::endl;
        exit(EXIT_FAILURE);
    }

    // Grow the sample buffer once, so push_back never reallocates in the loop.
    if (loop_count > 0) {
        load_time.reserve(load_time.size() + static_cast<std::size_t>(loop_count));
    }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

    int cnt = 0;
    while (cnt < loop_count) {
        // Busy-wait until reader() has consumed the previous value.
        if (!changed.load()) {
            auto x_ptr = x;
            ioctl(fd, PERF_EVENT_IOC_RESET, 0);
            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            // NOTE(review): the counted window also spans the user-space parts
            // of the surrounding ioctl() calls, so a non-zero sample is not
            // necessarily caused by this load alone -- confirm.
            auto x_copy = x_ptr->value; // getting cache miss here
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            x_copy++;
            x->value = x_copy;

            long long count = 0;
            // A short or failed read would otherwise record garbage.
            if (read(fd, &count, sizeof(count)) != static_cast<ssize_t>(sizeof(count))) {
                perror("read");
                close(fd);
                exit(EXIT_FAILURE);
            }
            load_time.push_back(static_cast<int>(count));
            cnt++;
            changed.store(true);
        }
    }

    close(fd); // release the perf event descriptor
}

// Reader thread: for each of loop_count rounds, busy-waits until writer() has
// set `changed`, copies x->value into y->value, then clears `changed` so the
// writer may produce the next value.
void reader() {
    int cnt = 0;
    while (cnt < loop_count) {
        if (changed.load()) {
            auto x_copy = x->value;
            y->value = x_copy;
            cnt++;
            changed.store(false);
        }
    }
}

int main(int argc, char** argv) {

x = new int_aligned(0);

y = new int_aligned(0);

changed.store(false);

loop_count = atoi(argv[1]);

thread t1(writer);

thread t2(reader);

t1.join();

t2.join();

cout << "x: " << x->value << endl;

cout << "y: " << y->value << endl;

print_performance_metrics(load_time, load_time.size());

return 0;

}

</code>

<code>#include <bits/stdc++.h> #include <chrono> #include <linux/perf_event.h> /* Definition of PERF_* constants */ #include <linux/hw_breakpoint.h> /* Definition of HW_* constants */ #include <sys/syscall.h> /* Definition of SYS_* constants */ #include <unistd.h> #include <cstring> #include <atomic> #include <sys/ioctl.h> using namespace std; struct int_aligned { alignas(1024) int value; // align to 1024 bytes, to avoid false sharing int_aligned(int val) : value(val) {} }; int_aligned* x; int_aligned* y; atomic<bool> changed; int loop_count; vector<int> load_time; static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { int fd; fd = syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags); if (fd == -1) { perror(""); exit(EXIT_FAILURE); } return fd; } void print_performance_metrics(vector<int> &numbers, int element_count) { sort(numbers.begin(), numbers.end()); int min = numbers[0]; int max = numbers[element_count - 1]; cout << "Min: " << min << endl; cout << "Max: " << max << endl; int percentile_req[] = {10, 20, 30, 40, 50, 60, 70, 80, 90}; int size_per = sizeof(percentile_req) / sizeof(percentile_req[0]); int percentile_values[size_per]; for (int i = 0; i < size_per; i++) { int index = (percentile_req[i] * element_count) / 100; percentile_values[i] = numbers[index]; } cout << "Percentile values are: " << endl; for (int i = 0; i < size_per; i++) { cout << setw(10) << percentile_req[i]; } cout << "n"; for (int i = 0; i < size_per; i++) { cout << setw(10) << percentile_values[i]; } cout << "n"; } void writer() { int cnt = 0; int fd; long long count; struct perf_event_attr pe; memset(&pe, 0, sizeof(pe)); pe.type = PERF_TYPE_HW_CACHE; pe.size = sizeof(pe); pe.config = (PERF_COUNT_HW_CACHE_L1D) | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16); pe.disabled = 1; pe.exclude_kernel = 1; pe.exclude_hv = 1; fd = perf_event_open(&pe, 0, -1, -1, 0); if (fd == -1) { 
fprintf(stderr, "Error opening leader %llxn", pe.config); cout << "error opening leadern" << endl; exit(EXIT_FAILURE); } ioctl(fd, PERF_EVENT_IOC_RESET, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); while (cnt < loop_count) { if (!changed.load()) { auto x_ptr = x; ioctl(fd, PERF_EVENT_IOC_RESET, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); auto x_copy = x_ptr->value; // getting cache miss here ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); x_copy++; x->value = x_copy; read(fd, &count, sizeof(long long)); load_time.push_back(count); cnt++; changed.store(true); } } } void reader() { int cnt = 0; while (cnt < loop_count) { if (changed.load()) { auto x_copy = x->value; y->value = x_copy; cnt++; changed.store(false); } } } int main(int argc, char** argv) { x = new int_aligned(0); y = new int_aligned(0); changed.store(false); loop_count = atoi(argv[1]); thread t1(writer); thread t2(reader); t1.join(); t2.join(); cout << "x: " << x->value << endl; cout << "y: " << y->value << endl; print_performance_metrics(load_time, load_time.size()); return 0; } </code>

#include <bits/stdc++.h>
#include <chrono>
#include <linux/perf_event.h>    /* Definition of PERF_* constants */
#include <linux/hw_breakpoint.h> /* Definition of HW_* constants */
#include <sys/syscall.h>         /* Definition of SYS_* constants */
#include <unistd.h>
#include <cstring>
#include <atomic>
#include <sys/ioctl.h>

using namespace std;

// Integer wrapper whose storage is over-aligned so that two instances never
// share a cache line. alignas(1024) on the member raises the alignment (and
// therefore the size) of the whole struct to 1024 bytes.
struct int_aligned {
    alignas(1024) int value; // align to 1024 bytes, to avoid false sharing

    // explicit: a plain int must not silently convert into a 1 KiB object.
    explicit int_aligned(int val) : value(val) {}
};

// Shared state of the writer/reader experiment.
// Allocated in main(); incremented by writer() and read by reader().
int_aligned* x;
// Allocated in main(); reader() stores its copy of x->value here.
int_aligned* y;
// Handoff flag: writer() sets it to true after updating x, reader() resets it
// to false after copying x into y.
atomic<bool> changed;
// Number of iterations each thread performs; set from argv[1] in main().
int loop_count;
// One perf counter reading per writer() iteration; summarised by main() after
// both threads have been joined.
vector<int> load_time;

// Wrapper around the raw perf_event_open system call (glibc exposes no
// function for it). Returns the event file descriptor on success; on failure
// it reports errno through perror() and terminates the process, so callers
// never observe -1.
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    const int descriptor = syscall(SYS_perf_event_open, hw_event, pid, cpu, group_fd, flags);
    if (descriptor != -1) {
        return descriptor;
    }
    perror("");
    exit(EXIT_FAILURE);
}

// Sorts the samples in place and prints their minimum, maximum and the
// 10th..90th percentiles (in steps of 10) to standard output.
//
// numbers        samples; sorted ascending by this call.
// element_count  number of leading samples (after sorting) to report on;
//                values larger than numbers.size() are clamped.
//
// Prints a short notice and returns when there is nothing to report.
void print_performance_metrics(std::vector<int> &numbers, int element_count) {
    // Indexing an empty vector is undefined behaviour, so bail out early.
    if (element_count <= 0 || numbers.empty()) {
        std::cout << "No samples to report" << std::endl;
        return;
    }
    const std::size_t sample_count =
        std::min(static_cast<std::size_t>(element_count), numbers.size());

    std::sort(numbers.begin(), numbers.end());
    std::cout << "Min: " << numbers.front() << std::endl;
    std::cout << "Max: " << numbers[sample_count - 1] << std::endl;

    constexpr std::array<int, 9> percentile_req{10, 20, 30, 40, 50, 60, 70, 80, 90};

    std::cout << "Percentile values are: " << std::endl;
    for (const int percentile : percentile_req) {
        std::cout << std::setw(10) << percentile;
    }
    std::cout << "\n";

    for (const int percentile : percentile_req) {
        // size_t arithmetic: percentile * count cannot overflow like int would.
        // percentile <= 90, so index is always < sample_count.
        const std::size_t index = static_cast<std::size_t>(percentile) * sample_count / 100;
        std::cout << std::setw(10) << numbers[index];
    }
    std::cout << "\n";
}

// Writer thread: increments x->value loop_count times, handing each new value
// to reader() through the `changed` flag, and records the L1D read-miss
// counter value observed around each load of x->value.
//
// The event is opened with pid 0 / cpu -1 and excludes kernel and hypervisor
// events. One sample per iteration is appended to load_time.
void writer() {
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HW_CACHE;
    pe.size = sizeof(pe);
    pe.config = (PERF_COUNT_HW_CACHE_L1D) |
                (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;

    const int fd = static_cast<int>(perf_event_open(&pe, 0, -1, -1, 0));
    if (fd == -1) {
        fprintf(stderr, "Error opening leader %llx\n", static_cast<unsigned long long>(pe.config));
        std::cout << "error opening leader\n" << std::endl;
        exit(EXIT_FAILURE);
    }

    // Grow the sample buffer once, so push_back never reallocates in the loop.
    if (loop_count > 0) {
        load_time.reserve(load_time.size() + static_cast<std::size_t>(loop_count));
    }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

    int cnt = 0;
    while (cnt < loop_count) {
        // Busy-wait until reader() has consumed the previous value.
        if (!changed.load()) {
            auto x_ptr = x;
            ioctl(fd, PERF_EVENT_IOC_RESET, 0);
            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            // NOTE(review): the counted window also spans the user-space parts
            // of the surrounding ioctl() calls, so a non-zero sample is not
            // necessarily caused by this load alone -- confirm.
            auto x_copy = x_ptr->value; // getting cache miss here
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            x_copy++;
            x->value = x_copy;

            long long count = 0;
            // A short or failed read would otherwise record garbage.
            if (read(fd, &count, sizeof(count)) != static_cast<ssize_t>(sizeof(count))) {
                perror("read");
                close(fd);
                exit(EXIT_FAILURE);
            }
            load_time.push_back(static_cast<int>(count));
            cnt++;
            changed.store(true);
        }
    }

    close(fd); // release the perf event descriptor
}

// Reader thread: for each of loop_count rounds, busy-waits until writer() has
// set `changed`, copies x->value into y->value, then clears `changed` so the
// writer may produce the next value.
void reader() {
    for (int handled = 0; handled < loop_count;) {
        if (!changed.load()) {
            continue; // nothing new published yet; poll again
        }
        const auto snapshot = x->value;
        y->value = snapshot;
        ++handled;
        changed.store(false);
    }
}

int main(int argc, char** argv) {
    x = new int_aligned(0);
    y = new int_aligned(0);
    changed.store(false);
    loop_count = atoi(argv[1]);
    thread t1(writer);
    thread t2(reader);
    t1.join();
    t2.join();
    cout << "x: " << x->value << endl;
    cout << "y: " << y->value << endl;
    print_performance_metrics(load_time, load_time.size());
    return 0;
}

I compile the code with the following command:

<code>g++ <filename> -o <out> -std=c++17 -pthread -O2

</code>

<code>g++ <filename> -o <out> -std=c++17 -pthread -O2 </code>

g++ <filename> -o <out> -std=c++17 -pthread -O2

And I run it with appropriate permissions for perf:

<code>echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid

</code>

<code>echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid </code>

echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid

The issue is that I am consistently getting cache misses when loading x in the writer thread, even though x is aligned to 1024 bytes and should not suffer from false sharing. x is only updated by the writer thread, so I don’t understand why there are cache misses.

Any insights into why this is happening and how I can resolve it would be greatly appreciated.

Danh mục