Fixes and tweaks to whadd

This commit is contained in:
Andrew Cassidy 2022-05-22 16:38:54 -07:00
parent aa6bd9602d
commit c05879f1c1
3 changed files with 42 additions and 32 deletions

View File

@@ -1,7 +1,7 @@
--- ---
BasedOnStyle: google BasedOnStyle: google
IndentWidth: 4 IndentWidth: 4
ColumnLimit: 160 ColumnLimit: 120
AllowShortBlocksOnASingleLine: Always AllowShortBlocksOnASingleLine: Always
AllowShortFunctionsOnASingleLine: All AllowShortFunctionsOnASingleLine: All
AlwaysBreakTemplateDeclarations: MultiLine AlwaysBreakTemplateDeclarations: MultiLine

View File

@@ -25,43 +25,53 @@
#include "util.h" #include "util.h"
namespace quicktex { template <typename T> using requires_arch = xsimd::kernel::requires_arch<T>;
template <class A, class T> inline next_size_t<T> widening_hadd(xsimd::batch<T, A> const& arg) {
using b_type = xsimd::batch<T, A>;
using r_type = next_size_t<T>;
const auto len = b_type::size;
std::array<T, len> buff; namespace quicktex::simd {
r_type sum = 0;
arg.store(&buff[0]); namespace kernel {
for (unsigned i = 0; i < len; i++) { sum += static_cast<r_type>(buff[i]); }
return sum;
}
#if XSIMD_WITH_NEON64 #if XSIMD_WITH_NEON64
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::neon64> const& arg) { template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::neon64>) {
// Pairwise widening sum, then sum all N/2 widened lanes return vaddlvq_s16(arg);
xsimd::batch<int32_t, xsimd::neon64> paired = vpaddlq_s16(arg);
return xsimd::hadd(paired);
} }
#endif #endif
#if XSIMD_WITH_SSE2 #if XSIMD_WITH_SSE2
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::sse2> const& arg) { template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::sse2>) {
// Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
xsimd::batch<int32_t, xsimd::sse2> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1)); xsimd::batch<int32_t, A> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1));
return xsimd::hadd(paired); return xsimd::hadd(paired);
} }
#endif #endif
#if XSIMD_WITH_AVX2 #if XSIMD_WITH_AVX2
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::avx2> const& arg) { template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::avx2>) {
// Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
xsimd::batch<int32_t, xsimd::avx2> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1)); xsimd::batch<int32_t, A> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));
return xsimd::hadd(paired); return xsimd::hadd(paired);
} }
#endif #endif
} // namespace quicktex template <class A, class T>
inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg, requires_arch<xsimd::generic>) {
// Generic implementation that should work everywhere
using b_type = xsimd::batch<T, A>;
using r_type = next_size_t<T>;
const auto len = b_type::size;
alignas(A::alignment()) T buffer[len];
r_type sum = 0;
arg.store_aligned(buffer);
for (T val : buffer) { sum += static_cast<r_type>(val); }
return sum;
}
} // namespace kernel
template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg) {
return kernel::whadd(arg, A{});
}
} // namespace quicktex::simd

View File

@@ -34,28 +34,28 @@ void test_widening_hadd() {
std::array<int16_t, vec_size> buffer; std::array<int16_t, vec_size> buffer;
std::iota(buffer.begin(), buffer.end(), 1); std::iota(buffer.begin(), buffer.end(), 1);
auto v = xsimd::load(&buffer[0]); auto v = xsimd::load_unaligned(&buffer[0]);
auto sum = widening_hadd(v); auto sum = simd::whadd(v);
assert(sum == vec_size / 2 * (vec_size + 1)); // Gauss formula assert(sum == vec_size / 2 * (vec_size + 1)); // Gauss formula
buffer.fill(1); buffer.fill(1);
v = xsimd::load(&buffer[0]); v = xsimd::load_unaligned(&buffer[0]);
sum = widening_hadd(v); sum = simd::whadd(v);
assert(sum == vec_size); assert(sum == vec_size);
buffer.fill(0); buffer.fill(0);
v = xsimd::load(&buffer[0]); v = xsimd::load_unaligned(&buffer[0]);
sum = widening_hadd(v); sum = simd::whadd(v);
assert(sum == 0); assert(sum == 0);
buffer.fill(std::numeric_limits<int16_t>::max()); buffer.fill(std::numeric_limits<int16_t>::max());
v = xsimd::load(&buffer[0]); v = xsimd::load_unaligned(&buffer[0]);
sum = widening_hadd(v); sum = simd::whadd(v);
assert(sum == std::numeric_limits<int16_t>::max() * (int)vec_size); assert(sum == std::numeric_limits<int16_t>::max() * (int)vec_size);
buffer.fill(std::numeric_limits<int16_t>::min()); buffer.fill(std::numeric_limits<int16_t>::min());
v = xsimd::load(&buffer[0]); v = xsimd::load_unaligned(&buffer[0]);
sum = widening_hadd(v); sum = simd::whadd(v);
assert(sum == std::numeric_limits<int16_t>::min() * (int)vec_size); assert(sum == std::numeric_limits<int16_t>::min() * (int)vec_size);
} }