I want to expand some `u8`s out to `u64`s, except instead of zero- or sign-extending, which have direct support, I want “copy-extending”: each input byte replicated into all eight bytes of its output lane. What’s the best way to do this (on Intel CPUs with AVX-512)? The example code is in Rust, but the host language isn’t the interesting part.
#![feature(portable_simd)]
use std::simd::*;
/// Copy-extends every byte of `x` into a full `u64` lane.
///
/// Output lane `i` holds `x[i]` repeated in all eight of its bytes, e.g.
/// `0xAB` becomes `0xABAB_ABAB_ABAB_ABAB`.
pub fn batch_splat_scalar(x: [u8; 16]) -> [u64; 16] {
    // A word built from eight identical bytes is the same in any byte order,
    // so the little-endian constructor is just a convenient spelling.
    x.map(|byte| u64::from_le_bytes([byte; 8]))
}
/// SIMD-typed wrapper around `batch_splat_scalar`.
///
/// Unpacks the vector into a plain array, runs the scalar routine on it and
/// repacks the sixteen resulting `u64` values into a vector.
pub fn batch_splat_simd(x: u8x16) -> u64x16 {
    let bytes = x.to_array();
    let widened = batch_splat_scalar(bytes);
    Simd::from_array(widened)
}
This compiles to something like the following with AVX-512:
# Compiler output for the copy-extend routine, processed as two halves of 8 lanes each.
# First half: load 8 bytes from [rsi] and zero-extend each one into a 64-bit lane.
vpmovzxbq zmm0, qword ptr [rsi]
# Broadcast a 64-bit constant to every lane. Its value is not shown in this listing;
# presumably one part of the 0x0101010101010101 replication multiplier (TODO confirm in the full output).
vpbroadcastq zmm1, qword ptr [rip + .LCPI0_0]
# rdi is the store base address below, so this looks like returning the output pointer in rax (confirm against the ABI).
mov rax, rdi
# Multiply the low 32 bits of each lane by the low 32 bits of the first constant (64-bit product per lane).
vpmuludq zmm2, zmm0, zmm1
# Second broadcast constant, also not shown; presumably the other part of the multiplier (TODO confirm).
vpbroadcastq zmm3, qword ptr [rip + .LCPI0_1]
# Same low-dword multiply against the second constant...
vpmuludq zmm0, zmm0, zmm3
# ...then move that product into the upper 32 bits of each lane.
vpsllq zmm0, zmm0, 32
# Combine the two partial results into the final 64-bit lanes.
vporq zmm0, zmm2, zmm0
# Store the first 64 bytes (8 lanes) of output.
vmovdqu64 zmmword ptr [rdi], zmm0
# Second half: same sequence on the next 8 input bytes, reusing the constants in zmm1 and zmm3.
vpmovzxbq zmm0, qword ptr [rsi + 8]
vpmuludq zmm1, zmm0, zmm1
vpmuludq zmm0, zmm0, zmm3
vpsllq zmm0, zmm0, 32
vporq zmm0, zmm1, zmm0
# Store the second 64 bytes (8 lanes) of output.
vmovdqu64 zmmword ptr [rdi + 64], zmm0
# Clear the upper vector register state before returning to the caller.
vzeroupper
ret
https://godbolt.org/z/67cW5GnKf