I am trying to understand how AMD GPUs perform integer division, so I disassembled this program:
// Kernel under study: stores the signed integer quotient data1[0] / data2[0]
// into data0[0]. The flat work-group size is pinned to exactly 1.
extern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, 1)))
test(int* data0, const int* data1, const int* data2) {
    const int numerator = data1[0];
    const int divisor = data2[0];
    data0[0] = numerator / divisor;
}
I looked at the output and marked the parts that I understand and the parts that I’m confused about: mainly, why do we multiply by a weird magic number (4294966784), and what is the logic that follows it?
// Disassembly of test(): computes data0[0] = data1[0] / data2[0] (signed 32-bit
// division) by dividing the absolute values with a reciprocal estimate and then
// putting the sign back.
// NOTE(review): each s_load_* result is consumed immediately below, so the
// s_waitcnt instructions that would normally separate them appear to have been
// left out of this listing - confirm against the raw disassembly.
//
// Load the operands from memory into scalar registers
// (presumably s[0:1] holds the kernel-argument pointer on entry - TODO confirm)
// s[2:3] = data2, the pointer argument at byte offset 0x10
s_load_b64 s[2:3], s[0:1], 0x10
// s4 = data2[0], the divisor
s_load_b32 s4, s[2:3], null
// s[0:1] = data0 and s[2:3] = data1, the pointer arguments at byte offsets 0 and 8
s_load_b128 s[0:3], s[0:1], null
// s2 = data1[0], the numerator
s_load_b32 s2, s[2:3], null
// get the divisor sign: s5 = 0 if the divisor is non-negative, -1 (all ones) if negative
s_ashr_i32 s5, s4, 31
// take the absolute value: s4 = (divisor + s5) ^ s5 = |divisor|
// (adding -1 and then flipping every bit negates a two's-complement value)
s_add_i32 s4, s4, s5
s_xor_b32 s4, s4, s5
// v0 = |divisor| converted to float, the input of the reciprocal below
v_cvt_f32_u32_e32 v0, s4
// s6 = -|divisor| (2^32 - |divisor| when read as unsigned); used by the refinement step
s_sub_i32 s6, 0, s4
// get the numerator sign: s7 = 0 or -1, same scheme as s5
s_ashr_i32 s7, s2, 31
// take the absolute value: s2 = |numerator|
s_add_i32 s2, s2, s7
s_xor_b32 s2, s2, s7
// get the sign of the result: s5 = -1 if exactly one operand was negative, else 0
s_xor_b32 s5, s7, s5
// calculate the reciprocal: v0 = approximately 1.0 / |divisor| in float
v_rcp_iflag_f32_e32 v0, v0
// NOTE(review): presumably a compiler-inserted hazard wait so the reciprocal
// result is ready before the multiply reads it - confirm in the ISA guide
s_waitcnt_depctr 0xfff
// I don't understand: what is this magic number?
// Explanation: 0x4f7ffffe is the bit pattern of the float 4294966784.0 = 2^32 - 512,
// the second-largest float below 2^32. Multiplying 1/|divisor| by it gives
// z = approximately 2^32 / |divisor|, i.e. the reciprocal as a 32-bit fixed-point fraction.
// Presumably it is kept slightly below 2^32 so that z never exceeds the true value
// and always fits in 32 bits - confirm against the LLVM division expansion.
v_mul_f32_e32 v0, 0x4f7ffffe, v0
// convert z to unsigned int, then copy it from the vector register into scalar s3
v_cvt_u32_f32_e32 v0, v0
v_readfirstlane_b32 s3, v0
// I do not understand the part from here
// Explanation: this is the unsigned division |numerator| / |divisor| using z,
// followed by restoring the sign. Below, d = |divisor| (s4) and n = |numerator| (s2).
//
// One Newton-Raphson step that sharpens z:
// s6 = (-d * z) mod 2^32, which equals 2^32 - d*z as long as z underestimates 2^32 / d,
// i.e. the amount by which d*z falls short of 2^32
s_mul_i32 s6, s6, s3
// s6 = high 32 bits of z * s6 = floor(z * (2^32 - d*z) / 2^32)
s_mul_hi_u32 s6, s3, s6
// z = z + s6, which is approximately z * (2 - d*z / 2^32)
s_add_i32 s3, s3, s6
// Quotient estimate: q = high 32 bits of n * z = floor(n * z / 2^32), roughly n / d
s_mul_hi_u32 s3, s2, s3
// Remainder of that estimate: r = n - q * d
s_mul_i32 s6, s3, s4
s_sub_i32 s2, s2, s6
// First correction: if r >= d (unsigned) the estimate was too small, so q += 1 and r -= d.
// Both candidates (q + 1 in s6, r - d in s7) are computed first, then the selects
// pick according to the compare.
s_add_i32 s6, s3, 1
s_sub_i32 s7, s2, s4
s_cmp_ge_u32 s2, s4
s_cselect_b32 s3, s6, s3
s_cselect_b32 s2, s7, s2
// Second correction: same test again; only the quotient is still needed, and it lands in s2.
// NOTE(review): two rounds presumably suffice because the estimate is never more
// than 2 below the true quotient - confirm against the LLVM division expansion.
s_add_i32 s6, s3, 1
s_cmp_ge_u32 s2, s4
s_cselect_b32 s2, s6, s3
// Apply the result sign: (q ^ s5) - s5 leaves q unchanged when s5 = 0 and negates it when s5 = -1
s_xor_b32 s2, s2, s5
s_sub_i32 s2, s2, s5
// to here
// store the result: v1 = quotient, v0 = 0 (offset), written to the address in s[0:1] (data0)
v_mov_b32_e32 v1, s2
v_mov_b32_e32 v0, 0
global_store_b32 v0, v1, s[0:1]
s_nop 0
// send the MSG_DEALLOC_VGPRS message, then end the program
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
s_endpgm