I have a C++ program that collects hardware counters using the perf_event_open and read system calls on CentOS Stream 9.
The code worked perfectly with kernel version 5.14.0-319.el9.x86_64. After a kernel update to 5.14.0-472.el9.x86_64, the read call occasionally returns -1. I haven’t been able to find the cause; it seems to happen at random. Sometimes I’m able to collect hundreds of samples before I encounter this error, and other times only 20 or so. Sometimes the program runs error-free.
I store the errno value immediately after the read(). The value is 10 (ECHILD). I then call strerror_r to print its description, which is “No child processes”.
This is an example of how I initialize the hardware counters:
// Number of hardware counters opened together as one perf event group.
#define TOTAL_HW_COUNTERS 4

// Parallel tables: entry i of each array describes the same counter.

// Human-readable label of each counter.
const char * pmu_names[TOTAL_HW_COUNTERS] = {
    "instructions",
    "cpu-cycles",
    "branch-misses",
    "cache-misses"
};

// Value stored in perf_event_attr.config for each counter.
unsigned int pmu_offsets[TOTAL_HW_COUNTERS] = {
    PERF_COUNT_HW_INSTRUCTIONS,
    PERF_COUNT_HW_CPU_CYCLES,
    PERF_COUNT_HW_BRANCH_MISSES,
    PERF_COUNT_HW_CACHE_MISSES
};

// Value stored in perf_event_attr.type for each counter.
unsigned int pmu_types[TOTAL_HW_COUNTERS] = {
    PERF_TYPE_HARDWARE,
    PERF_TYPE_HARDWARE,
    PERF_TYPE_HARDWARE,
    PERF_TYPE_HARDWARE
};
/**
 * Opens every counter described by pmu_types/pmu_offsets as one perf event
 * group and returns the descriptor of the group leader.
 *
 * The first table entry becomes the leader (opened with group_fd == -1); each
 * remaining entry is opened with the leader's descriptor as its group_fd.
 * All events start disabled, are inherited, and use the
 * PERF_FORMAT_GROUP | PERF_FORMAT_ID read format.
 *
 * @param child_pid  Passed through as the pid argument of perf_event_open.
 * @return Descriptor of the group leader. Descriptors of the other group
 *         members are not returned to the caller.
 *
 * If any event cannot be opened, an error is written to std::cerr and the
 * process terminates with EXIT_FAILURE.
 *
 * NOTE(review): inherit = 1 together with PERF_FORMAT_GROUP means a group
 * read sums over inherited child events; newer kernels appear to fail such a
 * read with ECHILD while a child's group does not match the parent's -
 * confirm against the running kernel's kernel/events/core.c.
 */
int initializePerf(pid_t child_pid) {
    // -1 as group_fd asks the kernel to create a new group, so the first
    // event opened becomes the leader.
    int leader_fd = -1;

    for (int i = 0; i < TOTAL_HW_COUNTERS; i++) {
        // A fresh local attr per event keeps the function reentrant; the
        // attr contents are not needed after perf_event_open returns.
        struct perf_event_attr pe;
        memset(&pe, 0, sizeof(pe));
        pe.type = pmu_types[i];
        pe.size = sizeof(pe);
        pe.config = pmu_offsets[i];
        pe.disabled = 1;
        pe.inherit = 1;
        pe.exclude_kernel = 0;
        pe.exclude_hv = 0;
        pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;

        const int fd = perf_event_open(&pe, child_pid, -1, leader_fd, 0);
        if (fd == -1) {
            const int errsv = errno;  // capture before stream I/O can change it
            std::cerr << (i == 0 ? "Error opening leader " : "Error opening group counter ")
                      << pe.config << " errno: " << errsv << std::endl;
            exit(EXIT_FAILURE);
        }

        if (i == 0) {
            leader_fd = fd;
        }
    }

    return leader_fd;
}
/**
 * Example entry point: opens the counter group with pid 0 and keeps the
 * leader descriptor so the elided work can enable and read the counters.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    // Keep the returned descriptor: it is the only handle through which the
    // group can later be read.
    const int leader_fd = initializePerf(0);

    // Do stuff

    // Only the leader descriptor is available here; the other group members
    // were opened inside initializePerf and are released at process exit.
    close(leader_fd);
    return 0;
}
I have a thread that wakes up every 100 ms to reset the counter values; it uses this function to read the hardware counters and store all samples in read_pmu_buf.
// Adapters that turn the result of strerror_r into a printable message for
// either flavour of the function: XSI returns an int status and fills the
// buffer, GNU returns the message pointer directly.
static inline const char * perfStrerrorMessage(int status, const char * buffer) {
    return status == 0 ? buffer : "Unknown error";
}
static inline const char * perfStrerrorMessage(const char * message, const char *) {
    return message;
}

/**
 * Reads one sample from fd into read_pmu_buf at offset bytes_read.
 *
 * @param bytes_read    Offset into read_pmu_buf at which the sample is stored.
 * @param read_pmu_buf  Buffer that accumulates the samples.
 * @param fd            Descriptor to read from.
 * @param count         Maximum number of bytes to read.
 * @return Number of bytes read, or -1 on failure. On failure the errno value
 *         and its description are written to std::cerr.
 *
 * EINTR and ECHILD are treated as transient and the read is retried a
 * bounded number of times before being reported. EINTR means the call was
 * interrupted by a signal. ECHILD is presumably the kernel refusing a
 * PERF_FORMAT_GROUP read while an inherited child group is out of step with
 * its parent - verify against the running kernel.
 */
int readPerfCounters(int bytes_read, char * read_pmu_buf, int fd, int count) {
    const int kMaxAttempts = 5;
    char * const dest = read_pmu_buf + bytes_read;

    int errsv = 0;
    for (int attempt = 0; attempt < kMaxAttempts; ++attempt) {
        const ssize_t got = read(fd, dest, count);
        if (got != -1) {
            // count is an int, so the result always fits the return type.
            return static_cast<int>(got);
        }

        errsv = errno;  // capture before any other call can overwrite it
        if (errsv != EINTR && errsv != ECHILD) {
            break;  // not a transient condition; report immediately
        }
    }

    std::cerr << "Error reading perf counters at fd " << fd;
    std::cerr << " errno: " << errsv << std::endl;
    char buffer[256];
    buffer[0] = '\0';
    const char * errorMsg = perfStrerrorMessage(strerror_r(errsv, buffer, sizeof(buffer)), buffer);
    std::cerr << errorMsg << std::endl;
    return -1;
}
Since this happened only after I updated the kernel, I tried a slightly older version than 5.14.0-472.el9.x86_64, 5.14.0-467.el9.x86_64, but the error behavior is still the same.
The error is definitely coming from the read() call, but I don’t understand: (1) how is “No child processes” related to a read()? And (2) what changed between kernel versions?