I have to parse about 1.5 billion records that look roughly like this:
<code>* 1832 FETCH (UID 10585 RFC822.SIZE 46507 FLAGS () BODY[HEADER.FIELDS (From To Subject Date)] {182}
From: SenderName <[email protected]>
Date: Wed, 14 Feb 2024 21:51:05 +0000
<code>* 1832 FETCH (UID 10585 RFC822.SIZE 46507 FLAGS () BODY[HEADER.FIELDS (From To Subject Date)] {182}
Subject: Some subject
From: SenderName <[email protected]>
To: [email protected]
Date: Wed, 14 Feb 2024 21:51:05 +0000
)
</code>
* 1832 FETCH (UID 10585 RFC822.SIZE 46507 FLAGS () BODY[HEADER.FIELDS (From To Subject Date)] {182}
Subject: Some subject
From: SenderName <[email protected]>
To: [email protected]
Date: Wed, 14 Feb 2024 21:51:05 +0000
)
Currently, I am trying to determine the fastest solution to this so I decided to set up a simple benchmark. Before I get yelled at for the terrible code, that is not the point of the question. I simply wrote the first thing that came to mind just so I could get the test set up.
I am on Windows. The data file (emails.txt) is memory-mapped for faster read access. The file contains 112008 records for testing and is 29 MB in size.
I have the following code for parsing. Again the quality of the code is unimportant (unless that is what is causing this issue). This is because I will not be using string streams in the final code.
<code>std::tuple<const char*, DWORD> memory_map_file()
HANDLE file_handle = CreateFileW(L"emails.txt", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
DWORD file_size = GetFileSize(file_handle, NULL);
HANDLE mapping_handle = CreateFileMappingW(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
LPVOID address = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0);
return { (char*)address, file_size };
const NewMessageHeader parse_rfc822_header_string_stream(std::string_view header)
//Parse the message header using string streams...
void profiling_string_stream(const char* data, DWORD file_size)
for (DWORD i = 0; i < file_size - 1; ++i)
if (data[i] == '*' && data[i + 1] == ' ' && !header.empty())
const NewMessageHeader message_header = parse_rfc822_header_string_stream(header);
const auto profile(void (*func)(const char*, DWORD), const char* data, DWORD file_size)
const auto start_scan{ std::chrono::steady_clock::now() };
profiling_scan(data, file_size);
const auto end_scan{ std::chrono::steady_clock::now() };
return std::chrono::duration<double>{ end_scan - start_scan };
//Memory map the file to remove OS operations overhead
const auto [data, file_size] = memory_map_file();
const auto time_string_stream = profile(profiling_string_stream, data, file_size);
std::cout << "Finished string stream in " << time_string_stream.count() << std::endl;
const auto start_string_stream{ std::chrono::steady_clock::now() };
profiling_string_stream(data, file_size);
const auto end_string_stream{ std::chrono::steady_clock::now() };
const std::chrono::duration<double> elapsed_seconds_string_stream{ end_string_stream - start_string_stream };
std::cout << "Finished string stream in " << elapsed_seconds_string_stream.count() << std::endl;
<code>std::tuple<const char*, DWORD> memory_map_file()
{
HANDLE file_handle = CreateFileW(L"emails.txt", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
DWORD file_size = GetFileSize(file_handle, NULL);
HANDLE mapping_handle = CreateFileMappingW(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
LPVOID address = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0);
return { (char*)address, file_size };
}
const NewMessageHeader parse_rfc822_header_string_stream(std::string_view header)
{
//Parse the message header using string streams...
}
void profiling_string_stream(const char* data, DWORD file_size)
{
std::string header;
header.reserve(3000);
for (DWORD i = 0; i < file_size - 1; ++i)
{
if (data[i] == '*' && data[i + 1] == ' ' && !header.empty())
{
const NewMessageHeader message_header = parse_rfc822_header_string_stream(header);
header.clear();
}
header += data[i];
}
}
const auto profile(void (*func)(const char*, DWORD), const char* data, DWORD file_size)
{
const auto start_scan{ std::chrono::steady_clock::now() };
profiling_scan(data, file_size);
const auto end_scan{ std::chrono::steady_clock::now() };
return std::chrono::duration<double>{ end_scan - start_scan };
}
int main()
{
//Memory map the file to remove OS operations overhead
const auto [data, file_size] = memory_map_file();
//ONE
const auto time_string_stream = profile(profiling_string_stream, data, file_size);
std::cout << "Finished string stream in " << time_string_stream.count() << std::endl;
//TWO
const auto start_string_stream{ std::chrono::steady_clock::now() };
profiling_string_stream(data, file_size);
const auto end_string_stream{ std::chrono::steady_clock::now() };
const std::chrono::duration<double> elapsed_seconds_string_stream{ end_string_stream - start_string_stream };
std::cout << "Finished string stream in " << elapsed_seconds_string_stream.count() << std::endl;
std::cin.get();
return 0;
}
</code>
std::tuple<const char*, DWORD> memory_map_file()
{
HANDLE file_handle = CreateFileW(L"emails.txt", GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL);
DWORD file_size = GetFileSize(file_handle, NULL);
HANDLE mapping_handle = CreateFileMappingW(file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
LPVOID address = MapViewOfFile(mapping_handle, FILE_MAP_READ, 0, 0, 0);
return { (char*)address, file_size };
}
// Parses one header record, passed as a non-owning view, into a NewMessageHeader.
// NOTE(review): the implementation is elided in this listing. As shown, the
// function has no return statement, so the real body must return a
// NewMessageHeader on every path.
const NewMessageHeader parse_rfc822_header_string_stream(std::string_view header)
{
//Parse the message header using string streams...
}
void profiling_string_stream(const char* data, DWORD file_size)
{
std::string header;
header.reserve(3000);
for (DWORD i = 0; i < file_size - 1; ++i)
{
if (data[i] == '*' && data[i + 1] == ' ' && !header.empty())
{
const NewMessageHeader message_header = parse_rfc822_header_string_stream(header);
header.clear();
}
header += data[i];
}
}
/// Measures the wall-clock duration of one call to `func(data, file_size)`.
///
/// @param func       Function to time; it receives `data` and `file_size` unchanged.
/// @param data       Buffer pointer forwarded to `func`.
/// @param file_size  Buffer length forwarded to `func`.
/// @return Elapsed time in seconds as a std::chrono::duration<double>,
///         measured with std::chrono::steady_clock.
const auto profile(void (*func)(const char*, DWORD), const char* data, DWORD file_size)
{
    const auto start_scan{ std::chrono::steady_clock::now() };
    // Call the function that was actually passed in. Invoking a fixed,
    // hard-coded function here would make every measurement report that
    // function's run time regardless of the `func` argument.
    func(data, file_size);
    const auto end_scan{ std::chrono::steady_clock::now() };
    return std::chrono::duration<double>{ end_scan - start_scan };
}
/// Benchmark driver: maps the data file, then times profiling_string_stream
/// twice — once through the profile() helper (ONE) and once with the timing
/// code written inline (TWO) — and prints both results in seconds.
///
/// @return 0 on success, 1 if the data file could not be mapped.
int main()
{
    //Memory map the file to remove OS operations overhead
    const auto [data, file_size] = memory_map_file();
    // Without a valid mapping the scan below would read through a null pointer.
    if (data == nullptr)
    {
        std::cerr << "Failed to map emails.txt into memory" << std::endl;
        return 1;
    }

    //ONE
    const auto time_string_stream = profile(profiling_string_stream, data, file_size);
    std::cout << "Finished string stream in " << time_string_stream.count() << std::endl;

    //TWO
    const auto start_string_stream{ std::chrono::steady_clock::now() };
    profiling_string_stream(data, file_size);
    const auto end_string_stream{ std::chrono::steady_clock::now() };
    const std::chrono::duration<double> elapsed_seconds_string_stream{ end_string_stream - start_string_stream };
    std::cout << "Finished string stream in " << elapsed_seconds_string_stream.count() << std::endl;

    // Keep the console window open until a key is pressed.
    std::cin.get();
    return 0;
}
Now for the interesting part: the portion labeled ONE runs in ~3.6 seconds, whereas the portion labeled TWO runs in ~5.5 seconds. This is a huge difference. Is there any reason this difference exists?
Here is the complete code: https://godbolt.org/z/dnfbY65G9
I should note that this code is meaningless without the data file, and that it requires windows.h to compile.
My question is: why does this discrepancy of two seconds exist, and how can I make sure my future benchmarks do not have this error?