I’m faced with having to do a bsf (find the first set bit) on a 512-bit bitmap. This is in the hot path, so I’d like to see how I can speed things up.
Right now I’m maintaining a header entry to know in which 32-bit block the first set bit will be found. With a bsf on the header, a bsf in the block designated by the header, and some arithmetic, one can compute the bsf of the whole bitmap fairly fast.
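Sketching the scheme (header maintenance elided; assume bit i of the header is set whenever the i-th 32-bit block is non-zero, and that all names here are illustrative):

#include <cstdint>

struct Bitmap512 {
    uint32_t blocks[16];
    uint16_t header; // bit i set <=> blocks[i] != 0, maintained on every update
};

int bsf512(const Bitmap512& b) {
    int block = __builtin_ctz(b.header);                // bsf on the header
    return block * 32 + __builtin_ctz(b.blocks[block]); // + bsf in that block
}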
But this obviously requires maintaining the header in addition to the bitmap itself. I’d like to explore alternatives, notably SSE or AVX, but I have failed to come up with a solution.
Is that possible at all? If so, how?
OK, if nobody else wants to, I’ll have a hack at it, assuming AVX2 is available (untested, because AVX2 isn’t available to me). I have no idea whether this is actually faster than just plowing through 8 iterations of bsf on a 64-bit integer register; I have my doubts. Assume src_address holds the most significant bit, and that the least significant bit sits 511 bit positions later, at the end of the 64-byte bitmap.
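For comparison, the scalar version I have my doubts against would be something like this (a sketch; it assumes the bitmap can be read as eight 64-bit words in the same order as the bits, and returns -1 if no bit is set):

#include <cstdint>

int BSF512_scalar(const uint64_t* words) {
    for (int i = 0; i < 8; ++i)       // 8 x 64 = 512 bits
        if (words[i] != 0)
            return i * 64 + __builtin_ctzll(words[i]); // bsf within the word
    return -1;                        // no bit set anywhere
}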
bsf finds the least significant 1 bit. We can isolate the least significant 1 bit with the formula
x & (-x) // Hacker's Delight, 1st ed., p. 11
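In scalar code, for illustration:

unsigned x   = 0x68;   // 0b01101000
unsigned lsb = x & -x; // 0b00001000: only the lowest 1 bit survives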
AVX2 uses 256-bit registers, so we'll have to unroll to get through all 512 bits.
#include <immintrin.h>

// bsf on an ordinary integer; gcc/clang builtin, counts trailing zeros
static inline int BSF(unsigned x) { return __builtin_ctz(x); }

int BSF512(char* src_address) {
    // load the 512 bits into 2 registers (2 x 32 bytes)
    __m256i data1 = _mm256_loadu_si256((__m256i*)src_address);
    __m256i data2 = _mm256_loadu_si256((__m256i*)(src_address + 32));
    // zero a register for negate and comparison
    __m256i zeroes = _mm256_setzero_si256();
    // negate each 32-bit lane
    __m256i negs1 = _mm256_sub_epi32(zeroes, data1);
    __m256i negs2 = _mm256_sub_epi32(zeroes, data2);
    // x & (-x): isolate the lowest 1 bit of every lane
    // (integer AND, which avoids the float casts - thanks rwong)
    __m256i lsbs1 = _mm256_and_si256(data1, negs1);
    __m256i lsbs2 = _mm256_and_si256(data2, negs2);
    // set an int32 lane to all 1s if it contains no set bit
    __m256i mask1 = _mm256_cmpeq_epi32(lsbs1, zeroes);
    __m256i mask2 = _mm256_cmpeq_epi32(lsbs2, zeroes);
    // one bit per lane into a normal register, inverted so that a set bit
    // means "this lane has a 1 somewhere"
    int which1 = ~_mm256_movemask_ps(_mm256_castsi256_ps(mask1)) & 0xFF;
    int which2 = ~_mm256_movemask_ps(_mm256_castsi256_ps(mask2)) & 0xFF;
    // do you have to do a dance here? Does the lowest memory address contain
    // the highest bit or the lowest bit - ask your data and change this
    if (which1 != 0) {
        // index of the first non-empty 32-bit lane
        int lane = BSF(which1);
        // extract that lane; vpermd handles the variable index
        __m256i v = _mm256_permutevar8x32_epi32(lsbs1, _mm256_set1_epi32(lane));
        int contains_set_bit = _mm_cvtsi128_si32(_mm256_castsi256_si128(v));
        // add the bits of the empty lanes below to the bsf within the lane
        return lane * 32 + BSF(contains_set_bit);
    } else if (which2 != 0) {
        // as above, but we found nothing in the first 256 bits
        int lane = BSF(which2);
        __m256i v = _mm256_permutevar8x32_epi32(lsbs2, _mm256_set1_epi32(lane));
        int contains_set_bit = _mm_cvtsi128_si32(_mm256_castsi256_si128(v));
        return 256 + lane * 32 + BSF(contains_set_bit);
    }
    return 0; // no bit set anywhere
}
Which, compiled with clang++ -std=c++11 -Wall -g -march=core-avx2 -O3 and disassembled with objdump -DC a.out, gives:
0000000000400a90 <BSF512(char*)>:
400a90: c5 fe 6f 07 vmovdqu (%rdi),%ymm0
400a94: c5 f5 ef c9 vpxor %ymm1,%ymm1,%ymm1
400a98: c5 f5 fa d0 vpsubd %ymm0,%ymm1,%ymm2
400a9c: c5 fd db c2 vpand %ymm2,%ymm0,%ymm0
400aa0: c5 fd 76 d1 vpcmpeqd %ymm1,%ymm0,%ymm2
400aa4: c5 fc 50 c2 vmovmskps %ymm2,%eax
400aa8: c4 e2 70 f3 d8 blsi %eax,%ecx
400aad: 74 26 je 400ad5 <BSF512(char*)+0x45>
400aaf: 89 c8 mov %ecx,%eax
400ab1: 83 e0 07 and $0x7,%eax
400ab4: c5 f9 6e c8 vmovd %eax,%xmm1
400ab8: c4 e2 75 36 c0 vpermd %ymm0,%ymm1,%ymm0
400abd: c5 f9 7e c2 vmovd %xmm0,%edx
400ac1: c1 e1 05 shl $0x5,%ecx
400ac4: f3 0f bc c2 tzcnt %edx,%eax
400ac8: ff c0 inc %eax
400aca: 85 d2 test %edx,%edx
400acc: 0f 44 c2 cmove %edx,%eax
400acf: 01 c8 add %ecx,%eax
400ad1: c5 f8 77 vzeroupper
400ad4: c3 retq
400ad5: c5 fe 6f 87 00 01 00 vmovdqu 0x100(%rdi),%ymm0
400adc: 00
400add: c5 f5 fa d0 vpsubd %ymm0,%ymm1,%ymm2
400ae1: c5 fd db c2 vpand %ymm2,%ymm0,%ymm0
400ae5: c5 fd 76 c9 vpcmpeqd %ymm1,%ymm0,%ymm1
400ae9: c5 fc 50 c9 vmovmskps %ymm1,%ecx
400aed: 31 c0 xor %eax,%eax
400aef: c4 e2 70 f3 d9 blsi %ecx,%ecx
400af4: 74 27 je 400b1d <BSF512(char*)+0x8d>
400af6: 89 c8 mov %ecx,%eax
400af8: 83 e0 07 and $0x7,%eax
400afb: c5 f9 6e c8 vmovd %eax,%xmm1
400aff: c4 e2 75 36 c0 vpermd %ymm0,%ymm1,%ymm0
400b04: c5 f9 7e c0 vmovd %xmm0,%eax
400b08: c1 e1 05 shl $0x5,%ecx
400b0b: f3 0f bc d0 tzcnt %eax,%edx
400b0f: ff c2 inc %edx
400b11: 85 c0 test %eax,%eax
400b13: 0f 44 d0 cmove %eax,%edx
400b16: 8d 84 11 00 01 00 00 lea 0x100(%rcx,%rdx,1),%eax
400b1d: c5 f8 77 vzeroupper
400b20: c3 retq
400b21: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
400b28: 00 00 00
400b2b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
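A hypothetical smoke test, assuming the little-endian lane numbering the code uses (whether that matches your bitmap's bit order is exactly the "dance" flagged in the comments):

#include <cstring>
#include <cstdio>

int main() {
    char bitmap[64];
    memset(bitmap, 0, sizeof bitmap);
    bitmap[40] = 4;                  // lane 2 of the upper half, bit 2
    printf("%d\n", BSF512(bitmap));  // prints 322 = 256 + 2*32 + 2
    return 0;
}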