I don’t know how to describe this in a few words. Anyway I have found something weird while messing with C++.
In short, I was trying to build a string representation of a sequence of bytes, and I have found that directly applying the conversion rules is faster than building a lookup table and performing lookup, and using std::vector
is faster than using std::unordered_map
.
But then something really weird happens and I don’t know why it is that way.
So I used a lambda function to generate the lookup table using the conversion rules, this saves the characters occupied by copy-pasting the data.
And somehow this causes the function to be dozens of times slower if there are multiple tables generated using lambda functions containing the same values.
But the thing is if I copy-paste the same data that would be generated by the lambda function into the program, under a different name, somehow the function using that lambda generated table becomes much faster, I don’t know why:
#include <algorithm>
#include <iostream>
#include <format>
#include <chrono>
#include <fstream>
#include <numeric>
#include <string>
#include <unordered_map>
#include <vector>
using std::vector;
typedef vector<uint8_t> bytes;
using std::string;
using std::chrono::steady_clock;
using std::chrono::duration;
using std::cout;
using std::unordered_map;
string DIGITS = "0123456789abcdef";
const vector<string> HEX1 = [] {
vector<string> data(256);
for (uint8_t i = 0; i < 255; i++) {
data[i] = "\x" + DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
data[255] = "\xff";
return data;
} ();
static inline string hexlify3(bytes arr) {
string repr = "";
for (auto& chr : arr) {
repr += " " + DIGITS[(chr & 240) >> 4] + DIGITS[chr & 15];
}
repr.erase(0, 1);
return repr;
}
static inline string hexlify5(bytes arr) {
string repr = "";
for (auto& chr : arr) {
repr += " " + HEX1[chr];
}
repr.erase(0, 1);
return repr;
}
uint32_t number = 5415321543947302498;
bytes _bytes;
char lorem_ipsum[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
bytes placeholder = [] {
bytes _bytes(124);
std::memcpy(_bytes.data(), &lorem_ipsum[0], 124);
return _bytes;
} ();
string temp;
char answer[] = "@x9bx90x00x00x00x00x00, the great answer to life, the universe and everything, squared";
bytes _answer = [] {
bytes _bytes(72);
std::memcpy(_bytes.data(), &answer[0], 72);
return _bytes;
} ();
int main() {
auto start = steady_clock::now();
for (int i = 0; i < 4096; i++) {
temp = hexlify3(placeholder);
}
auto end = steady_clock::now();
duration<double, std::nano> time = end - start;
cout << "hexlify3: " << time.count() / 4096 << " nanosecondsn";
start = steady_clock::now();
for (int i = 0; i < 4096; i++) {
temp = hexlify5(placeholder);
}
end = steady_clock::now();
time = end - start;
cout << "hexlify5: " << time.count() / 4096 << " nanosecondsn";
}
The above code compiles and doesn’t suffer the problem, as it should.
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 816.577 nanoseconds
hexlify5: 2265.14 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 865.063 nanoseconds
hexlify5: 2276.93 nanoseconds
Now if I add this code block to the program, without changing anything else, the program somehow is slowed down:
const unordered_map<uint8_t, string> HEX = [] {
unordered_map<uint8_t, string> data;
for (uint8_t i = 0; i < 255; i++) {
data[i] = DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
data[255] = "ff";
return data;
} ();
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3117.41 nanoseconds
hexlify5: 3137.28 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3093.63 nanoseconds
hexlify5: 3038.82 nanoseconds
If I add these, the code is slowed down even further:
const unordered_map<uint8_t, string> ASCII = [] {
unordered_map<uint8_t, string> data;
for (uint8_t i = 0; i < 128; i++) {
if (std::isprint(i)) {
data[i] = char(i);
}
else {
data[i] = "\x" + DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
}
for (int i = 128; i < 256; i++) {
data[i] = "\x" + DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
data[9] = "\t";
data[10] = "\n";
data[13] = "\r";
data[92] = "\\";
return data;
}();
const vector<string> ASCII1 = [] {
vector<string> data(256);
for (uint8_t i = 0; i < 128; i++) {
if (std::isprint(i)) {
data[i] = char(i);
}
else {
data[i] = "\x" + DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
}
for (int i = 128; i < 256; i++) {
data[i] = "\x" + DIGITS[(i & 240) >> 4] + DIGITS[i & 15];
}
data[9] = "\t";
data[10] = "\n";
data[13] = "\r";
data[92] = "\\";
return data;
}();
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3171.61 nanoseconds
hexlify5: 8897.92 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3767.41 nanoseconds
hexlify5: 56066.3 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3814.16 nanoseconds
hexlify5: 58556.5 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3811.47 nanoseconds
hexlify5: 55165.1 nanoseconds
But if I add this, somehow the code speeds up:
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3102.93 nanoseconds
hexlify5: 2674.17 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3125.44 nanoseconds
hexlify5: 2591.36 nanoseconds
PS D:xoreos-tools-masterbinRelease> C:UsersEstrangersourcereposhexlify_testx64Releasehexlify_test.exe
hexlify3: 3495.8 nanoseconds
hexlify5: 2680.74 nanoseconds
What is happening here? I am using Visual Studio 2022 17.9.7, on Windows 10 x64, the program is compiled in Release mode, targeting x64, compiler flags:
/permissive- /ifcOutput "hexlify_testx64Release" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"hexlify_testx64Releasevc143.pdb" /Zc:inline /fp:precise /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /Gd /Oi /MD /std:c++20 /FC /Fa"hexlify_testx64Release" /EHsc /nologo /Fo"hexlify_testx64Release" /Ot /Fp"hexlify_testx64Releasehexlify_test.pch" /diagnostics:column