Please note that this question is not about YUV422 to RGB conversion!
I have this code for a pixel order YUV422 to RGB conversion.
/*
 * Convert interleaved YUV 4:2:2 bytes to packed 8-bit RGB using SSSE3.
 *
 * Each inner iteration loads 16 input bytes and emits four RGB triples
 * (12 bytes).  Luma is taken from bytes 0, 4, 8 and 12 of the load; the
 * chroma pair is taken from bytes (1,3), (5,7), (9,11) and (13,15).
 * The coefficients (74, 102, 25, 52, 129, all scaled by 64) match the usual
 * BT.601 limited-range integer approximation, which implies byte 1 of each
 * group is U/Cb and byte 3 is V/Cr.
 *
 * img       - first input row
 * width     - upper bound for the byte offset x within a row; rows are read
 *             in 16-byte steps, so a width that is not a multiple of 16
 *             still causes a full 16-byte load
 * height    - number of rows
 * widthStep - byte distance between consecutive input rows
 * dst       - output buffer, written contiguously (no per-row stride)
 *
 * NOTE: every store writes 16 bytes while dst only advances by 12, so dst
 * must have at least 4 bytes of slack after the last RGB triple.
 * NOTE: the 16-bit sums use wrapping adds; inputs far outside nominal video
 * levels (e.g. Y = 255 together with U = 255) can overflow int16.
 */
static void yuv422ToRGB(unsigned char* img,
                        int width, int height, int widthStep, unsigned char* dst)
{
    /* Replicate each selected luma byte three times (R, G, B lanes),
     * zero-extended to 16 bit (-1 selects a zero byte). */
    __m128i yMask1 = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 4, -1, 4, -1, 4, -1, 8, -1, 8, -1);
    __m128i yMask2 = _mm_setr_epi8(8, -1, 12, -1, 12, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    /* Replicate the (byte 1, byte 3) chroma pair of each group per lane. */
    __m128i uvMask1 = _mm_setr_epi8(1, 3, 1, 3, 1, 3, 5, 7, 5, 7, 5, 7, 9, 11, 9, 11);
    __m128i uvMask2 = _mm_setr_epi8(9, 11, 13, 15, 13, 15, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);

    /* Unsigned coefficient pairs for _mm_maddubs_epi16, in R, G, B lane
     * order: (0,102), (25,52), (129,0).  -127 is the byte pattern of 129. */
    __m128i magicMask1 = _mm_setr_epi8(0, 102, 25, 52, -127, 0, 0, 102, 25, 52, -127, 0, 0, 102, 25, 52);
    __m128i magicMask2 = _mm_setr_epi8(-127, 0, 0, 102, 25, 52, -127, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    /* XOR with 0x80 turns an unsigned chroma byte c into the signed value
     * c - 128.  (The previous value 0x81 additionally flipped the low bit.) */
    __m128i const128 = _mm_set1_epi8(-128);
    __m128i const16 = _mm_set1_epi16(16);
    __m128i const32 = _mm_set1_epi16(32);   /* rounding term for the >> 6 */
    __m128i const74 = _mm_set1_epi16(74);

    /* The G lanes need a negative chroma contribution; multiply them by -1. */
    __m128i gFlipMask1 = _mm_setr_epi16(1, -1, 1, 1, -1, 1, 1, -1);
    __m128i gFlipMask2 = _mm_setr_epi16(1, 1, -1, 1, 0, 0, 0, 0);

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x += 16)
        {
            __m128i imgreg1 = _mm_loadu_si128((__m128i*)(img + x));

            /* Lanes 0..7: R0 G0 B0 R1 G1 B1 R2 G2 */
            __m128i yValues1 = _mm_shuffle_epi8(imgreg1, yMask1);
            yValues1 = _mm_subs_epi16(yValues1, const16);
            yValues1 = _mm_mullo_epi16(yValues1, const74);
            __m128i uvValues1 = _mm_shuffle_epi8(imgreg1, uvMask1);
            uvValues1 = _mm_xor_si128(uvValues1, const128);
            __m128i toAdd1 = _mm_maddubs_epi16(magicMask1, uvValues1);
            toAdd1 = _mm_mullo_epi16(toAdd1, gFlipMask1);
            toAdd1 = _mm_add_epi16(toAdd1, const32);
            __m128i rgb1 = _mm_add_epi16(yValues1, toAdd1);
            rgb1 = _mm_srai_epi16(rgb1, 6);

            /* Lanes 0..3: B2 R3 G3 B3; lanes 4..7 are padding. */
            __m128i yValues2 = _mm_shuffle_epi8(imgreg1, yMask2);
            yValues2 = _mm_subs_epi16(yValues2, const16);
            yValues2 = _mm_mullo_epi16(yValues2, const74);
            __m128i uvValues2 = _mm_shuffle_epi8(imgreg1, uvMask2);
            uvValues2 = _mm_xor_si128(uvValues2, const128);
            __m128i toAdd2 = _mm_maddubs_epi16(magicMask2, uvValues2);
            toAdd2 = _mm_mullo_epi16(toAdd2, gFlipMask2);
            toAdd2 = _mm_add_epi16(toAdd2, const32);
            __m128i rgb2 = _mm_add_epi16(yValues2, toAdd2);
            rgb2 = _mm_srai_epi16(rgb2, 6);

            /* Saturate to 0..255 and store; only the first 12 bytes are
             * kept, the remaining 4 are overwritten by the next store. */
            __m128i out = _mm_packus_epi16(rgb1, rgb2);
            _mm_storeu_si128((__m128i*)(dst), out);
            dst += 12;
        }
        img += widthStep;
    }
}
I had an idea to make it faster using an alternative calculation method that would require fewer instructions. To check how much performance I could gain, I commented out the six instructions inside the loop that might be saved. The rest of the code remained unchanged:
/*
 * Benchmark variant of the SSSE3 YUV 4:2:2 -> RGB routine.
 *
 * Six instructions per iteration (luma offset, luma scale and the rounding
 * add, for both halves) are deliberately disabled to measure how much time
 * they cost.  The output is therefore NOT a valid colour conversion: each
 * channel is (Y + chroma term) >> 6.  Everything else is meant to be
 * identical to the full routine.
 *
 * Each inner iteration loads 16 input bytes and emits 12 output bytes.
 * Luma comes from bytes 0, 4, 8, 12 of the load; chroma pairs come from
 * bytes (1,3), (5,7), (9,11), (13,15).
 *
 * img       - first input row
 * width     - upper bound for the byte offset x within a row (16-byte steps)
 * height    - number of rows
 * widthStep - byte distance between consecutive input rows
 * dst       - output buffer, written contiguously
 *
 * NOTE: every store writes 16 bytes while dst only advances by 12, so dst
 * must have at least 4 bytes of slack after the last triple.
 */
static void yuv422ToRGB(unsigned char* img,
                        int width, int height, int widthStep, unsigned char* dst)
{
    /* Replicate each selected luma byte three times, zero-extended to
     * 16 bit.  The first index must be 0 (the first luma byte); it was 5,
     * which fed a chroma byte into the first output channel. */
    __m128i yMask1 = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 4, -1, 4, -1, 4, -1, 8, -1, 8, -1);
    __m128i yMask2 = _mm_setr_epi8(8, -1, 12, -1, 12, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    /* Replicate the (byte 1, byte 3) chroma pair of each group per lane. */
    __m128i uvMask1 = _mm_setr_epi8(1, 3, 1, 3, 1, 3, 5, 7, 5, 7, 5, 7, 9, 11, 9, 11);
    __m128i uvMask2 = _mm_setr_epi8(9, 11, 13, 15, 13, 15, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1);

    /* Unsigned coefficient pairs for _mm_maddubs_epi16: (0,102), (25,52),
     * (129,0).  -127 is the byte pattern of 129. */
    __m128i magicMask1 = _mm_setr_epi8(0, 102, 25, 52, -127, 0, 0, 102, 25, 52, -127, 0, 0, 102, 25, 52);
    __m128i magicMask2 = _mm_setr_epi8(-127, 0, 0, 102, 25, 52, -127, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    /* XOR with 0x80 turns an unsigned chroma byte c into the signed value
     * c - 128.  (The previous value 0x81 additionally flipped the low bit.) */
    __m128i const128 = _mm_set1_epi8(-128);

    /* Only referenced by the disabled instructions below; kept so that the
     * experiment can be reverted by un-commenting those lines. */
    __m128i const16 = _mm_set1_epi16(16);
    __m128i const32 = _mm_set1_epi16(32);
    __m128i const74 = _mm_set1_epi16(74);

    /* Multiply the middle lane of each triple by -1. */
    __m128i gFlipMask1 = _mm_setr_epi16(1, -1, 1, 1, -1, 1, 1, -1);
    __m128i gFlipMask2 = _mm_setr_epi16(1, 1, -1, 1, 0, 0, 0, 0);

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x += 16)
        {
            __m128i imgreg1 = _mm_loadu_si128((__m128i*)(img + x));

            /* First eight 16-bit lanes. */
            __m128i yValues1 = _mm_shuffle_epi8(imgreg1, yMask1);
            //yValues1 = _mm_subs_epi16(yValues1, const16);   /* disabled for the benchmark */
            //yValues1 = _mm_mullo_epi16(yValues1, const74);  /* disabled for the benchmark */
            __m128i uvValues1 = _mm_shuffle_epi8(imgreg1, uvMask1);
            uvValues1 = _mm_xor_si128(uvValues1, const128);
            __m128i toAdd1 = _mm_maddubs_epi16(magicMask1, uvValues1);
            toAdd1 = _mm_mullo_epi16(toAdd1, gFlipMask1);
            //toAdd1 = _mm_add_epi16(toAdd1, const32);        /* disabled for the benchmark */
            __m128i rgb1 = _mm_add_epi16(yValues1, toAdd1);
            rgb1 = _mm_srai_epi16(rgb1, 6);

            /* Remaining four lanes; the upper four are padding. */
            __m128i yValues2 = _mm_shuffle_epi8(imgreg1, yMask2);
            //yValues2 = _mm_subs_epi16(yValues2, const16);   /* disabled for the benchmark */
            //yValues2 = _mm_mullo_epi16(yValues2, const74);  /* disabled for the benchmark */
            __m128i uvValues2 = _mm_shuffle_epi8(imgreg1, uvMask2);
            uvValues2 = _mm_xor_si128(uvValues2, const128);
            __m128i toAdd2 = _mm_maddubs_epi16(magicMask2, uvValues2);
            toAdd2 = _mm_mullo_epi16(toAdd2, gFlipMask2);
            //toAdd2 = _mm_add_epi16(toAdd2, const32);        /* disabled for the benchmark */
            __m128i rgb2 = _mm_add_epi16(yValues2, toAdd2);
            rgb2 = _mm_srai_epi16(rgb2, 6);

            /* Saturate to 0..255 and store; only the first 12 bytes are
             * kept, the remaining 4 are overwritten by the next store. */
            __m128i out = _mm_packus_epi16(rgb1, rgb2);
            _mm_storeu_si128((__m128i*)(dst), out);
            dst += 12;
        }
        img += widthStep;
    }
}
Profiling the modified version, it turns out it is around 10% slower than the original (~2.2ms vs 1.9ms). This doesn’t make any sense to me. I was expecting that it might not make a difference because the bottleneck is somewhere else, but how can fewer instructions make the code slower?
What I’ve checked:
- The compiler output https://godbolt.org/z/893c8EfPW
- Effects of caching. I tested this call order: A A B B A B. All calls of A were faster than calls of B.
The platform:
- Intel Atom E3845
- clang 14
- Compiler flags:
-std=c++14 -O3 -march=silvermont
How can this behavior be explained in a general way?