I am writing a double-double arithmetic library for AVX/AVX2. One of the issues I encountered is that the non-SIMD and SIMD versions have different memory layouts.
<code>// Pair-wise
// Scalar double-double value: both components of one number are stored next
// to each other in memory, `hi` first and `lo` second.
// The typedef lets the type be written as plain `Float64x2` in C as well as
// in C++; `struct Float64x2` keeps working too.
typedef struct Float64x2 {
    double hi; // first component of the pair
    double lo; // second component of the pair
} Float64x2;
// Component-wise
// Four double-double values held in two AVX vectors: lane i of `hi` and
// lane i of `lo` together form element i.
// The typedef makes the bare name `__m256dx2` a type name in C; without it
// only `struct __m256dx2` names the type there.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept unchanged so existing users compile.
typedef struct __m256dx2 {
    __m256d hi; // double hi[4]
    __m256d lo; // double lo[4]
} __m256dx2;
</code>
<code>// Pair-wise
// Scalar double-double value: both components of one number are stored next
// to each other in memory, `hi` first and `lo` second.
// The typedef lets the type be written as plain `Float64x2` in C as well as
// in C++; `struct Float64x2` keeps working too.
typedef struct Float64x2 {
    double hi; // first component of the pair
    double lo; // second component of the pair
} Float64x2;
// Component-wise
// Four double-double values held in two AVX vectors: lane i of `hi` and
// lane i of `lo` together form element i.
// The typedef makes the bare name `__m256dx2` a type name in C; without it
// only `struct __m256dx2` names the type there.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept unchanged so existing users compile.
typedef struct __m256dx2 {
    __m256d hi; // double hi[4]
    __m256d lo; // double lo[4]
} __m256dx2;
</code>
// Pair-wise
// Scalar double-double value: both components of one number are stored next
// to each other in memory, `hi` first and `lo` second.
// The typedef lets the type be written as plain `Float64x2` in C as well as
// in C++; `struct Float64x2` keeps working too.
typedef struct Float64x2 {
    double hi; // first component of the pair
    double lo; // second component of the pair
} Float64x2;
// Component-wise
// Four double-double values held in two AVX vectors: lane i of `hi` and
// lane i of `lo` together form element i.
// The typedef makes the bare name `__m256dx2` a type name in C; without it
// only `struct __m256dx2` names the type there.
// NOTE(review): identifiers containing a double underscore are reserved for
// the implementation; the name is kept unchanged so existing users compile.
typedef struct __m256dx2 {
    __m256d hi; // double hi[4]
    __m256d lo; // double lo[4]
} __m256dx2;
This means I will have to convert from a pair-wise layout,
{x0,y0}, {x1,y1}, {x2,y2}, {x3,y3},
to a component-wise layout, {x0,x1,x2,x3}, {y0,y1,y2,y3},
and vice versa.
<code>// Loads four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3} (8 doubles,
// unaligned) from mem_addr and returns them component-wise:
// hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}. Needs AVX only (no AVX2).
__m256dx2 _mm256x2_loadu_pdx2(const double* mem_addr) {
    __m256d p01 = _mm256_loadu_pd(mem_addr); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    __m256d p23 = _mm256_loadu_pd(mem_addr); // x2 y2 x3 y3

    // Regroup the 128-bit halves first: the unpack instructions work inside
    // each 128-bit lane, so pairs 0/2 and 1/3 must share a vector.
    __m256d p02 = _mm256_permute2f128_pd(p01, p23, 0x20); // x0 y0 x2 y2
    __m256d p13 = _mm256_permute2f128_pd(p01, p23, 0x31); // x1 y1 x3 y3

    __m256dx2 val;
    val.hi = _mm256_unpacklo_pd(p02, p13); // x0 x1 x2 x3
    val.lo = _mm256_unpackhi_pd(p02, p13); // y0 y1 y2 y3
    return val;
}
// Stores a component-wise value (hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}) to
// mem_addr as four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3}
// (8 doubles, unaligned). Needs AVX only (no AVX2).
void _mm256x2_storeu_pdx2(double* mem_addr, __m256dx2 val) {
    // Interleave inside each 128-bit lane; this yields pairs 0/2 and 1/3.
    __m256d p02 = _mm256_unpacklo_pd(val.hi, val.lo); // x0 y0 x2 y2
    __m256d p13 = _mm256_unpackhi_pd(val.hi, val.lo); // x1 y1 x3 y3

    // Swap 128-bit halves across the two vectors to restore memory order.
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x20)); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x31)); // x2 y2 x3 y3
}
</code>
<code>// Loads four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3} (8 doubles,
// unaligned) from mem_addr and returns them component-wise:
// hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}. Needs AVX only (no AVX2).
__m256dx2 _mm256x2_loadu_pdx2(const double* mem_addr) {
    __m256d p01 = _mm256_loadu_pd(mem_addr); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    __m256d p23 = _mm256_loadu_pd(mem_addr); // x2 y2 x3 y3

    // Regroup the 128-bit halves first: the unpack instructions work inside
    // each 128-bit lane, so pairs 0/2 and 1/3 must share a vector.
    __m256d p02 = _mm256_permute2f128_pd(p01, p23, 0x20); // x0 y0 x2 y2
    __m256d p13 = _mm256_permute2f128_pd(p01, p23, 0x31); // x1 y1 x3 y3

    __m256dx2 val;
    val.hi = _mm256_unpacklo_pd(p02, p13); // x0 x1 x2 x3
    val.lo = _mm256_unpackhi_pd(p02, p13); // y0 y1 y2 y3
    return val;
}
// Stores a component-wise value (hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}) to
// mem_addr as four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3}
// (8 doubles, unaligned). Needs AVX only (no AVX2).
void _mm256x2_storeu_pdx2(double* mem_addr, __m256dx2 val) {
    // Interleave inside each 128-bit lane; this yields pairs 0/2 and 1/3.
    __m256d p02 = _mm256_unpacklo_pd(val.hi, val.lo); // x0 y0 x2 y2
    __m256d p13 = _mm256_unpackhi_pd(val.hi, val.lo); // x1 y1 x3 y3

    // Swap 128-bit halves across the two vectors to restore memory order.
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x20)); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x31)); // x2 y2 x3 y3
}
</code>
// Loads four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3} (8 doubles,
// unaligned) from mem_addr and returns them component-wise:
// hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}. Needs AVX only (no AVX2).
__m256dx2 _mm256x2_loadu_pdx2(const double* mem_addr) {
    __m256d p01 = _mm256_loadu_pd(mem_addr); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    __m256d p23 = _mm256_loadu_pd(mem_addr); // x2 y2 x3 y3

    // Regroup the 128-bit halves first: the unpack instructions work inside
    // each 128-bit lane, so pairs 0/2 and 1/3 must share a vector.
    __m256d p02 = _mm256_permute2f128_pd(p01, p23, 0x20); // x0 y0 x2 y2
    __m256d p13 = _mm256_permute2f128_pd(p01, p23, 0x31); // x1 y1 x3 y3

    __m256dx2 val;
    val.hi = _mm256_unpacklo_pd(p02, p13); // x0 x1 x2 x3
    val.lo = _mm256_unpackhi_pd(p02, p13); // y0 y1 y2 y3
    return val;
}
// Stores a component-wise value (hi = {x0,x1,x2,x3}, lo = {y0,y1,y2,y3}) to
// mem_addr as four pair-wise values {x0,y0},{x1,y1},{x2,y2},{x3,y3}
// (8 doubles, unaligned). Needs AVX only (no AVX2).
void _mm256x2_storeu_pdx2(double* mem_addr, __m256dx2 val) {
    // Interleave inside each 128-bit lane; this yields pairs 0/2 and 1/3.
    __m256d p02 = _mm256_unpacklo_pd(val.hi, val.lo); // x0 y0 x2 y2
    __m256d p13 = _mm256_unpackhi_pd(val.hi, val.lo); // x1 y1 x3 y3

    // Swap 128-bit halves across the two vectors to restore memory order.
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x20)); // x0 y0 x1 y1
    mem_addr += sizeof(__m256d) / sizeof(double);
    _mm256_storeu_pd(mem_addr, _mm256_permute2f128_pd(p02, p13, 0x31)); // x2 y2 x3 y3
}
How can I convert between the two memory layouts in AVX (or AVX2)?