Here is the C code for a simple working program that multiplies two 16-byte float vectors (here, the same vector by itself) using SSE and stores the result in s.
#include <xmmintrin.h>
/* Input vector shared by main() and sse_multiply(); main() points it at a
   16-byte-aligned buffer of four floats. */
float *data;
/*
 * Multiplies the four floats at `data` element-wise by themselves with SSE.
 *
 * Both operands are loaded with _mm_load_ps (compiled to movaps in the listing
 * below), so `data` must point to 16-byte-aligned memory.
 * The product only lives in the local `s`; nothing is written back, so the
 * function currently has no observable result.
 */
void sse_multiply() {
__m128 s = _mm_mul_ps(_mm_load_ps(data), _mm_load_ps(data));
// To get the answer, store s into another float *result (or back into float *data).
}
/*
 * Allocates a 16-byte-aligned buffer of four floats, fills it with
 * 1.0f..4.0f one element at a time, and runs sse_multiply() on it.
 *
 * NOTE(review): the aligned_alloc() result is not checked for NULL and the
 * buffer is never freed.
 */
int main() {
data = aligned_alloc(16, 4 * sizeof(float)); // alignment = 16, size = 4 floats
data[0] = 1.0f; // each scalar assignment becomes its own movss store in the listing
data[1] = 2.0f;
data[2] = 3.0f;
data[3] = 4.0f;
sse_multiply();
return 0;
}
On godbolt.org, the above program generates the following assembly code when compiled with the x86-64 GCC compiler:
# Storage for the C global `float *data`: 8 zero-filled bytes (one 64-bit pointer).
data:
.zero 8
# void sse_multiply()
# Compiled form of:
#   __m128 s = _mm_mul_ps(_mm_load_ps(data), _mm_load_ps(data));
# Every intermediate value is spilled to a stack slot and reloaded, which
# looks like unoptimised (-O0-style) output.
# rsp is never lowered: the slots [rbp-64]..[rbp-1] sit below the stack
# pointer, which a leaf function may use as the System V red zone.
# Clobbers: rax, xmm0, xmm1.
sse_multiply:
push rbp
mov rbp, rsp
mov rax, QWORD PTR data[rip]        # rax = data (the pointer value)
mov QWORD PTR [rbp-64], rax         # spill the pointer to a temporary ...
mov rax, QWORD PTR [rbp-64]         # ... and reload it straight away
movaps xmm0, XMMWORD PTR [rax]      # xmm0 = data[0..3]; aligned 16-byte load
mov rax, QWORD PTR data[rip]        # same sequence again for the other operand
mov QWORD PTR [rbp-56], rax
mov rax, QWORD PTR [rbp-56]
movaps xmm1, XMMWORD PTR [rax]      # xmm1 = data[0..3]
movaps XMMWORD PTR [rbp-32], xmm1   # spill both operands to the stack
movaps XMMWORD PTR [rbp-48], xmm0
movaps xmm0, XMMWORD PTR [rbp-32]   # reload one operand
mulps xmm0, XMMWORD PTR [rbp-48]    # xmm0 = element-wise product of the four floats
movaps XMMWORD PTR [rbp-16], xmm0   # store the product (presumably the local `s`); never read again
nop
pop rbp
ret
# int main()
# Allocates the 16-byte buffer, stores the four float constants .LC0-.LC3
# into it one element at a time, calls sse_multiply, and returns 0.
# The pointer is reloaded from `data` before every element store.
main:
push rbp
mov rbp, rsp
mov esi, 16                         # 2nd argument: size = 4 * sizeof(float) = 16
mov edi, 16                         # 1st argument: alignment = 16
call aligned_alloc
mov QWORD PTR data[rip], rax        # data = returned pointer (not checked for NULL)
mov rax, QWORD PTR data[rip]
movss xmm0, DWORD PTR .LC0[rip]     # xmm0 = 1.0f
movss DWORD PTR [rax], xmm0         # data[0] = 1.0f
mov rax, QWORD PTR data[rip]
add rax, 4                          # rax = &data[1]
movss xmm0, DWORD PTR .LC1[rip]     # xmm0 = 2.0f
movss DWORD PTR [rax], xmm0         # data[1] = 2.0f
mov rax, QWORD PTR data[rip]
add rax, 8                          # rax = &data[2]
movss xmm0, DWORD PTR .LC2[rip]     # xmm0 = 3.0f
movss DWORD PTR [rax], xmm0         # data[2] = 3.0f
mov rax, QWORD PTR data[rip]
add rax, 12                         # rax = &data[3]
movss xmm0, DWORD PTR .LC3[rip]     # xmm0 = 4.0f
movss DWORD PTR [rax], xmm0         # data[3] = 4.0f
mov eax, 0                          # NOTE(review): presumably al = 0 vector args, since sse_multiply() is declared without a prototype - confirm
call sse_multiply
mov eax, 0                          # return value 0
pop rbp
ret
# IEEE-754 single-precision bit patterns of the four float constants that
# main stores into data[0..3].
.LC0:
.long 1065353216                    # 0x3F800000 = 1.0f
.LC1:
.long 1073741824                    # 0x40000000 = 2.0f
.LC2:
.long 1077936128                    # 0x40400000 = 3.0f
.LC3:
.long 1082130432                    # 0x40800000 = 4.0f
You can see that main() uses the movss SSE instruction (move a single scalar float) to store each element, one for every data[<index>] = <value>; line. I know GCC emits the movss instruction when loading floats, but movss is very slow. Also, just to load 4 floats into float *data; we generated 15 lines of assembly code (shown below), which isn't good.
mov rax, QWORD PTR data[rip]        # rax = data
movss xmm0, DWORD PTR .LC0[rip]     # xmm0 = 1.0f
movss DWORD PTR [rax], xmm0         # data[0] = 1.0f
mov rax, QWORD PTR data[rip]        # pointer reloaded for every element
add rax, 4                          # rax = &data[1]
movss xmm0, DWORD PTR .LC1[rip]     # xmm0 = 2.0f
movss DWORD PTR [rax], xmm0         # data[1] = 2.0f
mov rax, QWORD PTR data[rip]
add rax, 8                          # rax = &data[2]
movss xmm0, DWORD PTR .LC2[rip]     # xmm0 = 3.0f
movss DWORD PTR [rax], xmm0         # data[2] = 3.0f
mov rax, QWORD PTR data[rip]
add rax, 12                         # rax = &data[3]
movss xmm0, DWORD PTR .LC3[rip]     # xmm0 = 4.0f
movss DWORD PTR [rax], xmm0         # data[3] = 4.0f
My question is: Is there a better way to load & unload data to & from aligned memory locations in C, especially when working with SSE?
15