I’m interested in the most efficient way to check for equality of memory blocks.
Currently, I use memcmp(...) == 0
to determine if two memory blocks are equal. While this works, memcmp
is designed not only to check for equality but also to determine the ordering of two memory blocks (greater, less, or equal). This additional functionality might introduce unnecessary overhead.
Task:
I need to efficiently check whether two memory blocks are equal or not.
Problem:
The fact that memcmp
provides ordering information means it might be less efficient than a function specifically designed to check for equality.
The additional overhead in memcmp
is related to the need to determine the exact ordering of the blocks if they are not equal. For example, if memcmp
returns a non-zero result, it means there are two bytes at some index i
in each memory block that differ in a certain way, and all previous bytes are equal. To achieve this, memcmp
must verify the equality of all previous bytes, which takes time. In contrast, a function that only checks for equality could, for instance, immediately check the last bytes and, upon finding a difference, terminate the check. This could lead to amortized improvements in time complexity.
What I’ve tried:
I wrote my own implementations to demonstrate this idea and ran some tests.
#include <cstring>
#include <immintrin.h>
bool memcmp_memeq(const void* s1, const void* s2, size_t n) {
return memcmp(s1, s2, n) == 0;
}
bool my_memeq(const void* s1, const void* s2, size_t n) {
const static size_t simd_width = 16;
if (n <= simd_width) {
return memcmp(s1, s2, n) == 0;
}
const int8_t* end1 = (const int8_t*)s1 + n - simd_width;
const int8_t* end2 = (const int8_t*)s2 + n - simd_width;
const __m128i m1 = _mm_loadu_si128((__m128i*)end1);
const __m128i m2 = _mm_loadu_si128((__m128i*)end2);
const __m128i result = _mm_cmpeq_epi8(m1, m2);
if (_mm_movemask_epi8(result) != 0xFFFF) {
return false;
}
return memcmp(s1, s2, (const int8_t*)end1 - (const int8_t*)s1) == 0;
}
bool my_memeq_256(const void* s1, const void* s2, size_t n) {
const static size_t simd_width = 32;
if (n <= simd_width) {
return memcmp(s1, s2, n) == 0;
}
const int8_t* end1 = (const int8_t*)s1 + n - simd_width;
const int8_t* end2 = (const int8_t*)s2 + n - simd_width;
const __m256i m1 = _mm256_loadu_si256((__m256i*)end1);
const __m256i m2 = _mm256_loadu_si256((__m256i*)end2);
const __m256i result = _mm256_cmpeq_epi8(m1, m2);
if (static_cast<unsigned int>(_mm256_movemask_epi8(result)) != 0xFFFFFFFF) {
return false;
}
return memcmp(s1, s2, (const int8_t*)end1 - (const int8_t*)s1) == 0;
}
#include <cstring>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <gtest/gtest.h>
#include <src/util/my_memeq.h>
template <typename Func>
std::pair<std::chrono::duration<double>, bool> measure_time(
Func func,
const std::vector<std::pair<std::string, std::string>>& test_cases,
int iterations = 100000) {
if (test_cases.empty()) {
return { std::chrono::duration<double>(0), true };
}
volatile bool result;
const auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i) {
for (const auto& test_case : test_cases) {
result = func(test_case.first.c_str(), test_case.second.c_str(), test_case.first.size());
}
}
const auto end = std::chrono::high_resolution_clock::now();
return { (end - start) / test_cases.size() / iterations, result };
}
void run_tests(std::ostream& out, size_t len, const std::vector<std::pair<std::string, std::string>>& test_cases) {
const auto memcmp_memeq_time_result = measure_time(memcmp_memeq, test_cases);
const auto my_memeq_time_result = measure_time(my_memeq, test_cases);
const auto my_memeq_256_time_result = measure_time(my_memeq_256, test_cases);
const auto memcmp_memeq_time = memcmp_memeq_time_result.first.count();
const auto my_memeq_time = my_memeq_time_result.first.count();
const auto my_memeq_256_time = my_memeq_256_time_result.first.count();
const auto memcmp_memeq_result = memcmp_memeq_time_result.second;
const auto my_memeq_result = my_memeq_time_result.second;
const auto my_memeq_256_result = my_memeq_256_time_result.second;
EXPECT_EQ(memcmp_memeq_result, my_memeq_result);
EXPECT_EQ(memcmp_memeq_result, my_memeq_256_result);
out << "RESULTS for len: " << len << " bytesn";
out << std::setw(19) << "memcmp_memeq time: " << std::setw(9) << std::setprecision(4) << memcmp_memeq_time << " secondsn";
out << std::setw(19) << "my_memeq time: " << std::setw(9) << std::setprecision(4) << my_memeq_time << " secondsn";
out << std::setw(19) << "my_memeq_256 time: " << std::setw(9) << std::setprecision(4) << my_memeq_256_time << " secondsn";
out << 'n';
}
std::vector<std::pair<std::string, std::string>>
generate_test_cases(
size_t len,
bool include_equal = true,
bool include_start = true,
bool include_middle = true,
bool include_end = true,
bool include_quarters = true) {
std::vector<std::pair<std::string, std::string>> test_cases;
std::string str1 (len, 'A');
std::string str2 (len, 'A');
if (include_equal) {
test_cases.emplace_back(str1, str2);
}
if (include_end) {
str2[len - 1] = 'B';
test_cases.emplace_back(str1, str2);
str2[len - 1] = 'A';
}
if (include_start) {
str2[0] = 'B';
test_cases.emplace_back(str1, str2);
str2[0] = 'A';
}
if (include_middle) {
if (len > 2) {
str2[len / 2] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 2] = 'A';
}
}
if (include_quarters) {
if (len > 4) {
str2[len / 4] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 4] = 'A';
str2[len / 4 * 3] = 'B';
test_cases.emplace_back(str1, str2);
str2[len / 4 * 3] = 'A';
}
}
return test_cases;
}
TEST(MemcmpTest, PerformanceComparison) {
const static size_t start_len = 4;
const static size_t max_bytes = 131072;
std::stringstream out;
out << "VARIOUS STRINGSnn";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
true,
true,
true,
true,
true)
);
}
out << "DIFFERENCE AT THE STARTnn";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
true,
false,
false,
false)
);
}
out << "DIFFERENCE AT THE ENDnn";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
false,
false,
true,
false)
);
}
out << "DIFFERENCE AT THE MIDDLEnn";
for (size_t len = start_len; len <= max_bytes; len *= 2) {
run_tests(
out,
len,
generate_test_cases(
len,
false,
false,
true,
false,
false)
);
}
const auto output = out.str();
// std::cout << output;
{{ std::ofstream out_file ("../test/my_memeq_test_results.txt");
out_file << output;
}}
}
Test results:
VARIOUS STRINGS
RESULTS for len: 4 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.6e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.6e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.7e-08 seconds
my_memeq_256 time: 1.8e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 1.5e-08 seconds
my_memeq time: 1.8e-08 seconds
my_memeq_256 time: 1.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 1.6e-08 seconds
my_memeq time: 1.8e-08 seconds
my_memeq_256 time: 1.9e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 1.7e-08 seconds
my_memeq time: 1.9e-08 seconds
my_memeq_256 time: 2e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 1.9e-08 seconds
my_memeq time: 2.1e-08 seconds
my_memeq_256 time: 2.2e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.5e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 4.1e-08 seconds
my_memeq time: 3.3e-08 seconds
my_memeq_256 time: 3.5e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 8e-08 seconds
my_memeq time: 6e-08 seconds
my_memeq_256 time: 6e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.57e-07 seconds
my_memeq time: 1.21e-07 seconds
my_memeq_256 time: 1.22e-07 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.97e-07 seconds
my_memeq time: 2.19e-07 seconds
my_memeq_256 time: 2.2e-07 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 6.34e-07 seconds
my_memeq time: 4.25e-07 seconds
my_memeq_256 time: 4.26e-07 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.429e-06 seconds
my_memeq time: 9.97e-07 seconds
my_memeq_256 time: 9.8e-07 seconds
DIFFERENCE AT THE START
RESULTS for len: 4 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 2.4e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE END
RESULTS for len: 4 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.5e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 3e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 3.5e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.8e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 4.2e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 6.1e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 9.8e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.95e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 4.84e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 9.42e-07 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.862e-06 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE MIDDLE
RESULTS for len: 4 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.6e-08 seconds
my_memeq_256 time: 2.6e-08 seconds
RESULTS for len: 8 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 3.2e-08 seconds
my_memeq_256 time: 4.9e-08 seconds
RESULTS for len: 16 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.7e-08 seconds
RESULTS for len: 32 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 2.8e-08 seconds
my_memeq_256 time: 2.7e-08 seconds
RESULTS for len: 64 bytes
memcmp_memeq time: 2.7e-08 seconds
my_memeq time: 3e-08 seconds
my_memeq_256 time: 3e-08 seconds
RESULTS for len: 128 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 3e-08 seconds
my_memeq_256 time: 3.2e-08 seconds
RESULTS for len: 256 bytes
memcmp_memeq time: 2.8e-08 seconds
my_memeq time: 3.3e-08 seconds
my_memeq_256 time: 3.1e-08 seconds
RESULTS for len: 512 bytes
memcmp_memeq time: 3.1e-08 seconds
my_memeq time: 3.2e-08 seconds
my_memeq_256 time: 3.3e-08 seconds
RESULTS for len: 1024 bytes
memcmp_memeq time: 3.3e-08 seconds
my_memeq time: 3.4e-08 seconds
my_memeq_256 time: 3.6e-08 seconds
RESULTS for len: 2048 bytes
memcmp_memeq time: 3.7e-08 seconds
my_memeq time: 3.7e-08 seconds
my_memeq_256 time: 3.8e-08 seconds
RESULTS for len: 4096 bytes
memcmp_memeq time: 4.3e-08 seconds
my_memeq time: 4.7e-08 seconds
my_memeq_256 time: 4.8e-08 seconds
RESULTS for len: 8192 bytes
memcmp_memeq time: 5.8e-08 seconds
my_memeq time: 6e-08 seconds
my_memeq_256 time: 6.1e-08 seconds
RESULTS for len: 16384 bytes
memcmp_memeq time: 1.02e-07 seconds
my_memeq time: 1.06e-07 seconds
my_memeq_256 time: 1.07e-07 seconds
RESULTS for len: 32768 bytes
memcmp_memeq time: 2.31e-07 seconds
my_memeq time: 2.36e-07 seconds
my_memeq_256 time: 2.41e-07 seconds
RESULTS for len: 65536 bytes
memcmp_memeq time: 5.17e-07 seconds
my_memeq time: 5.2e-07 seconds
my_memeq_256 time: 5.23e-07 seconds
RESULTS for len: 131072 bytes
memcmp_memeq time: 9.9e-07 seconds
my_memeq time: 1.007e-06 seconds
my_memeq_256 time: 9.78e-07 seconds
Separately, here are the results for the largest memory blocks:
VARIOUS STRINGS
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.429e-06 seconds
my_memeq time: 9.97e-07 seconds
my_memeq_256 time: 9.8e-07 seconds
DIFFERENCE AT THE START
RESULTS for len: 131072 bytes
memcmp_memeq time: 2.6e-08 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE END
RESULTS for len: 131072 bytes
memcmp_memeq time: 1.862e-06 seconds
my_memeq time: 2.7e-08 seconds
my_memeq_256 time: 2.9e-08 seconds
DIFFERENCE AT THE MIDDLE
RESULTS for len: 131072 bytes
memcmp_memeq time: 9.9e-07 seconds
my_memeq time: 1.007e-06 seconds
my_memeq_256 time: 9.78e-07 seconds
As can be seen, my implementations can provide significant performance improvements in certain cases.
However, slight performance degradation can be expected for smaller memory blocks.
I am interested to know if there is an existing solution specifically optimized for checking memory block equality.