For day 4 of the 2024 advent of code there’s a problem where you need to find how many “XMAS” strings are contained in a grid of characters such as
MMMSXXMASM
MSAMXMSMSA
AMXSXMAAMM
MSAMASMSMX
XMASAMXAMM
XXAMMXXAMA
SMSMSASXSS
SAXAMASAAA
MAMMMXMMMM
MXMXAXMASX
The way I solved that problem was to load all 32 characters of the 8 possible directions starting from an X into a SIMD register and comparing it to a mask.
That worked and is relatively fast (it reached 460 MB/s when I increased the input file size to test its limit) but when talking about our solutions on a discord server containing a lot of developers, and a few ones with over 15 years of professional experience, one of them told me that for this specific problem the use of SIMD instructions isn’t what would make the code run faster and the underlying algorithm itself is more important.
So I’m wondering, in what situations is SIMD worth it? How do I recognize a situation where using SIMD instructions would make a noticeable difference? I thought that comparing all 32 characters in 1 instruction would make a big difference instead of nesting 2 for loops but I might have been wrong?
typedef struct {
std::vector<char> chars;
uint32_t line_length;
} Data;
Data readFile() {
std::ifstream file("./input/day4_input.txt", std::ios::binary | std::ios::ate);
std::streamsize size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<char> buffer(size);
file.read(buffer.data(), size);
file.close();
uint32_t line_length = std::ranges::find(buffer, 'n') - buffer.begin();
buffer.erase(std::remove(buffer.begin(), buffer.end(), 'n'), buffer.end());
Data input_data { buffer, line_length };
return input_data;
}
int main(int argc, char** argv) {
Data idata = readFile();
auto coordsToIdx = [&idata] (uint32_t i, uint32_t j) -> uint32_t { return i*idata.line_length + j; };
auto idxToCoords = [&idata] (uint32_t idx) -> std::pair<uint32_t, uint32_t> {
return std::make_pair(static_cast<uint32_t>(idx / idata.line_length), static_cast<uint32_t>(idx % idata.line_length));
};
unsigned int res = 0;
const __m256i _mask = _mm256_setr_epi8(
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S',
'X', 'M', 'A', 'S'
);
std::array<int, 8> out_buff;
for(size_t i = 0; i < idata.chars.size(); i++) {
if(idata.chars[i] != 'X') continue;
char buff[32] = {0};
std::pair<uint32_t, uint32_t> coords = idxToCoords(i);
bool u = coords.first >= 3;
bool d = coords.first <= (idata.chars.size() / idata.line_length) - 4; // 0 indexed
bool l = coords.second >= 3;
bool r = coords.second <= idata.line_length - 4;
if(u) for(size_t c = 0; c < 4; c++) buff[c] = idata.chars[coordsToIdx(coords.first - c, coords.second)];
if(u && r) for(size_t c = 0; c < 4; c++) buff[c + 4] = idata.chars[coordsToIdx(coords.first - c, coords.second + c)];
if(r) for(size_t c = 0; c < 4; c++) buff[c + 8] = idata.chars[coordsToIdx(coords.first, coords.second + c)];
if(d && r) for(size_t c = 0; c < 4; c++) buff[c + 12] = idata.chars[coordsToIdx(coords.first + c, coords.second + c)];
if(d) for(size_t c = 0; c < 4; c++) buff[c + 16] = idata.chars[coordsToIdx(coords.first + c, coords.second)];
if(d && l) for(size_t c = 0; c < 4; c++) buff[c + 20] = idata.chars[coordsToIdx(coords.first + c, coords.second - c)];
if(l) for(size_t c = 0; c < 4; c++) buff[c + 24] = idata.chars[coordsToIdx(coords.first, coords.second - c)];
if(u && l) for(size_t c = 0; c < 4; c++) buff[c + 28] = idata.chars[coordsToIdx(coords.first - c, coords.second - c)];
__m256i _block = _mm256_loadu_epi8(buff);
__m256i _cmpeq = _mm256_cmpeq_epi32(_block, _mask);
_mm256_storeu_epi32(out_buff.data(), _cmpeq);
// Contains signed 32 bits integers for each direction
// If a direction match, all bits are set to 1
// Which is -1. So we substract the total to add it instead
res -= std::accumulate(out_buff.begin(), out_buff.end(), 0);
}
std::cout << "Result is : << res << std::endl;
}
11