From c05879f1c10ed6463b2f6d375d4e31a1ccabfdf0 Mon Sep 17 00:00:00 2001 From: Andrew Cassidy Date: Sun, 22 May 2022 16:38:54 -0700 Subject: [PATCH] Fixes and tweaks to whadd --- .clang-format | 2 +- quicktex/VecUtil.h | 52 ++++++++++++++++++++++--------------- quicktex/tests/TestSIMD.cpp | 20 +++++++------- 3 files changed, 42 insertions(+), 32 deletions(-) diff --git a/.clang-format b/.clang-format index ed7b74d..1f706e5 100644 --- a/.clang-format +++ b/.clang-format @@ -1,7 +1,7 @@ --- BasedOnStyle: google IndentWidth: 4 -ColumnLimit: 160 +ColumnLimit: 120 AllowShortBlocksOnASingleLine: Always AllowShortFunctionsOnASingleLine: All AlwaysBreakTemplateDeclarations: MultiLine diff --git a/quicktex/VecUtil.h b/quicktex/VecUtil.h index 60e936c..1347c01 100644 --- a/quicktex/VecUtil.h +++ b/quicktex/VecUtil.h @@ -25,43 +25,53 @@ #include "util.h" -namespace quicktex { -template inline next_size_t widening_hadd(xsimd::batch const& arg) { - using b_type = xsimd::batch; - using r_type = next_size_t; - const auto len = b_type::size; +template using requires_arch = xsimd::kernel::requires_arch; - std::array buff; - r_type sum = 0; +namespace quicktex::simd { - arg.store(&buff[0]); - for (unsigned i = 0; i < len; i++) { sum += static_cast(buff[i]); } - - return sum; -} +namespace kernel { #if XSIMD_WITH_NEON64 -template <> inline int32_t widening_hadd(xsimd::batch const& arg) { - // Pairwise widening sum, then sum all N/2 widened lanes - xsimd::batch paired = vpaddlq_s16(arg); - return xsimd::hadd(paired); +template inline int32_t whadd(xsimd::batch const& arg, requires_arch) { + return vaddlvq_s16(arg); } #endif #if XSIMD_WITH_SSE2 -template <> inline int32_t widening_hadd(xsimd::batch const& arg) { +template inline int32_t whadd(xsimd::batch const& arg, requires_arch) { // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes - xsimd::batch paired = _mm_madd_epi16(arg, _mm_set1_epi16(1)); + xsimd::batch paired = _mm_madd_epi16(arg, _mm_set1_epi16(1)); return xsimd::hadd(paired); } #endif #if XSIMD_WITH_AVX2 -template <> inline int32_t widening_hadd(xsimd::batch const& arg) { +template inline int32_t whadd(xsimd::batch const& arg, requires_arch) { // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes - xsimd::batch paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1)); + xsimd::batch paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1)); return xsimd::hadd(paired); } #endif -} // namespace quicktex \ No newline at end of file +template +inline next_size_t whadd(xsimd::batch const& arg, requires_arch) { + // Generic implementation that should work everywhere + using b_type = xsimd::batch; + using r_type = next_size_t; + const auto len = b_type::size; + + alignas(A::alignment()) T buffer[len]; + r_type sum = 0; + + arg.store_aligned(buffer); + for (T val : buffer) { sum += static_cast(val); } + + return sum; +} +} // namespace kernel + +template inline next_size_t whadd(xsimd::batch const& arg) { + return kernel::whadd(arg, A{}); +} + +} // namespace quicktex::simd \ No newline at end of file diff --git a/quicktex/tests/TestSIMD.cpp b/quicktex/tests/TestSIMD.cpp index f660c2d..7c1be9c 100644 --- a/quicktex/tests/TestSIMD.cpp +++ b/quicktex/tests/TestSIMD.cpp @@ -34,28 +34,28 @@ void test_widening_hadd() { std::array buffer; std::iota(buffer.begin(), buffer.end(), 1); - auto v = xsimd::load(&buffer[0]); - auto sum = widening_hadd(v); + auto v = xsimd::load_unaligned(&buffer[0]); + auto sum = simd::whadd(v); assert(sum == vec_size / 2 * (vec_size + 1)); // Gauss formula buffer.fill(1); - v = xsimd::load(&buffer[0]); - sum = widening_hadd(v); + v = xsimd::load_unaligned(&buffer[0]); + sum = simd::whadd(v); assert(sum == vec_size); buffer.fill(0); - v = xsimd::load(&buffer[0]); - sum = widening_hadd(v); + v = xsimd::load_unaligned(&buffer[0]); + sum = simd::whadd(v); assert(sum == 0); buffer.fill(std::numeric_limits::max()); - v = xsimd::load(&buffer[0]); - sum = widening_hadd(v); + v = xsimd::load_unaligned(&buffer[0]); + sum = simd::whadd(v); assert(sum == std::numeric_limits::max() * (int)vec_size); buffer.fill(std::numeric_limits::min()); - v = xsimd::load(&buffer[0]); - sum = widening_hadd(v); + v = xsimd::load_unaligned(&buffer[0]); + sum = simd::whadd(v); assert(sum == std::numeric_limits::min() * (int)vec_size); }