I tested memcpy performance from heap to heap and from heap to shared memory ([shm_open](https://www.man7.org/linux/man-pages/man3/shm_open.3.html)). The test code is as follows:
// shm_msg.hpp
#ifndef _SHM_MSG_HPP_
#define _SHM_MSG_HPP_
#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <array>
#include <stdint.h>
/* Payload capacity in bytes (100 MiB). The expansion is parenthesized so the
   macro behaves as a single value inside larger expressions such as
   `x / BUF_SIZE` or `x % BUF_SIZE`. */
#define BUF_SIZE (100 * 1024 * 1024)

/* Layout of the shared memory object: three scalar fields surrounding a
   fixed-size byte payload of BUF_SIZE bytes. */
struct shmbuf
{
    uint32_t a;
    double b;
    std::array<uint8_t, BUF_SIZE> data; /* bulk payload copied by the benchmark */
    uint64_t c;
};
#endif // !_SHM_MSG_HPP_
// shm_openWriter.cpp
#include "shm_msg.hpp"
#include <iostream>
#include <ctype.h>
#include <vector>
#include <cstring>
#include <chrono>
int main(int argc, char *argv[])
{
/* Create shared memory object and set its size to the size
of our structure. */
int fd = shm_open("SHM_1", O_CREAT | O_EXCL | O_RDWR, 0600);
if (fd == -1)
{
perror("shm_open");
}
size_t size = sizeof(shmbuf);
if (ftruncate(fd, size) == -1)
{
perror("ftruncate");
}
/* Map the object into the caller's address space. */
shmbuf *shmp = (shmbuf *)mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (shmp == MAP_FAILED)
{
perror("mmap");
}
/* Copy data into the shared memory object. */
std::vector<uint8_t> vec(BUF_SIZE, 0x56);
vec[100] = 0x89;
shmp->a = 5;
shmp->b = 2.5;
shmp->c = 100L;
std::chrono::high_resolution_clock::time_point tp1 = std::chrono::high_resolution_clock::now();
std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp2 = std::chrono::high_resolution_clock::now();
printf("data[100] = 0x%Xn", shmp->data[100]);
std::cout << "a = " << shmp->a << std::endl;
std::cout << "b = " << shmp->b << std::endl;
std::cout << "c = " << shmp->c << std::endl;
std::cout << "memcpy time from heap to SHM = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp2 - tp1).count() / 1e6 << " ms." << std::endl;
std::vector<uint8_t> vec2(BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp3 = std::chrono::high_resolution_clock::now();
std::memcpy(shmp->data.data(), vec.data(), BUF_SIZE);
std::chrono::high_resolution_clock::time_point tp4 = std::chrono::high_resolution_clock::now();
std::cout << "memcpy time from heap to heap = " << std::chrono::duration_cast<std::chrono::nanoseconds>(tp4 - tp3).count() / 1e6 << " ms." << std::endl;
shm_unlink("SHM_1");
exit(EXIT_SUCCESS);
}
compile and run the test:
g++ shm_openWriter.cpp -o shm_openWriter
./shm_openWriter
output:
data[100] = 0x89
a = 5
b = 2.5
c = 100
memcpy time from heap to SHM = 343.589 ms.
memcpy time from heap to heap = 27.0181 ms.
It can be seen from the output that memcpy from heap to shared memory is more than 12x slower than memcpy from heap to heap (343.589 ms vs. 27.0181 ms).
Why is memcpy from heap to shared memory so slow? Is there any way to improve the performance of memcpy into shared memory?