From c05879f1c10ed6463b2f6d375d4e31a1ccabfdf0 Mon Sep 17 00:00:00 2001
From: Andrew Cassidy <drewcassidy@me.com>
Date: Sun, 22 May 2022 16:38:54 -0700
Subject: [PATCH] Fixes and tweaks to whadd

---
 .clang-format               |  2 +-
 quicktex/VecUtil.h          | 52 ++++++++++++++++++++++---------------
 quicktex/tests/TestSIMD.cpp | 20 +++++++-------
 3 files changed, 42 insertions(+), 32 deletions(-)
diff --git a/.clang-format b/.clang-format
index ed7b74d..1f706e5 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,7 +1,7 @@
 ---
 BasedOnStyle: google
 IndentWidth: 4
-ColumnLimit: 160
+ColumnLimit: 120
 AllowShortBlocksOnASingleLine: Always
 AllowShortFunctionsOnASingleLine: All
 AlwaysBreakTemplateDeclarations: MultiLine
diff --git a/quicktex/VecUtil.h b/quicktex/VecUtil.h
index 60e936c..1347c01 100644
--- a/quicktex/VecUtil.h
+++ b/quicktex/VecUtil.h
@@ -25,43 +25,53 @@
 
 #include "util.h"
 
-namespace quicktex {
-template <class A, class T> inline next_size_t<T> widening_hadd(xsimd::batch<T, A> const& arg) {
-    using b_type = xsimd::batch<T, A>;
-    using r_type = next_size_t<T>;
-    const auto len = b_type::size;
+namespace quicktex::simd {
 
-    std::array<T, len> buff;
-    r_type sum = 0;
+template <typename T> using requires_arch = xsimd::kernel::requires_arch<T>;  // arch tag type for kernel dispatch
 
-    arg.store(&buff[0]);
-    for (unsigned i = 0; i < len; i++) { sum += static_cast<r_type>(buff[i]); }
-
-    return sum;
-}
+namespace kernel {
 
 #if XSIMD_WITH_NEON64
-template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::neon64> const& arg) {
-    // Pairwise widening sum, then sum all N/2 widened lanes
-    xsimd::batch<int32_t, xsimd::neon64> paired = vpaddlq_s16(arg);
-    return xsimd::hadd(paired);
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::neon64>) {
+    return vaddlvq_s16(arg);  // single NEON reduction: widens each int16 lane and sums them into one int32
 }
 #endif
 
 #if XSIMD_WITH_SSE2
-template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::sse2> const& arg) {
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::sse2>) {
     // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
-    xsimd::batch<int32_t, xsimd::sse2> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1));
+    xsimd::batch<int32_t, A> paired = _mm_madd_epi16(arg, _mm_set1_epi16(1));  // lane i = arg[2i] + arg[2i+1]
     return xsimd::hadd(paired);
 }
 #endif
 
 #if XSIMD_WITH_AVX2
-template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::avx2> const& arg) {
+template <class A> inline int32_t whadd(xsimd::batch<int16_t, A> const& arg, requires_arch<xsimd::avx2>) {
     // Pairwise widening sum with multiply by 1, then sum all N/2 widened lanes
-    xsimd::batch<int32_t, xsimd::avx2> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));
+    xsimd::batch<int32_t, A> paired = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));  // lane i = arg[2i] + arg[2i+1]
     return xsimd::hadd(paired);
 }
 #endif
 
-}  // namespace quicktex
\ No newline at end of file
+template <class A, class T>
+inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg, requires_arch<xsimd::generic>) {
+    // Portable fallback: spill every lane to a scratch array, then accumulate in the next wider type
+    using wide_t = next_size_t<T>;
+    constexpr auto lane_count = xsimd::batch<T, A>::size;
+
+    // The scratch array carries arch A's alignment, which is what store_aligned writes into
+    alignas(A::alignment()) T lanes[lane_count];
+    arg.store_aligned(lanes);
+
+    wide_t total = 0;
+    for (unsigned i = 0; i < lane_count; ++i) { total += static_cast<wide_t>(lanes[i]); }
+
+    return total;
+}
+}  // namespace kernel
+
+template <class A, class T> inline next_size_t<T> whadd(xsimd::batch<T, A> const& arg) {
+    return kernel::whadd(arg, A{});  // tag dispatch: A{} selects the matching kernel::whadd overload
+}
+
+}  // namespace quicktex::simd
\ No newline at end of file
diff --git a/quicktex/tests/TestSIMD.cpp b/quicktex/tests/TestSIMD.cpp
index f660c2d..7c1be9c 100644
--- a/quicktex/tests/TestSIMD.cpp
+++ b/quicktex/tests/TestSIMD.cpp
@@ -34,28 +34,28 @@ void test_widening_hadd() {
     std::array<int16_t, vec_size> buffer;
 
     std::iota(buffer.begin(), buffer.end(), 1);
-    auto v = xsimd::load(&buffer[0]);
-    auto sum = widening_hadd(v);
+    auto v = xsimd::load_unaligned(&buffer[0]);
+    auto sum = simd::whadd(v);
     assert(sum == vec_size / 2 * (vec_size + 1));  // Gauss formula
 
     buffer.fill(1);
-    v = xsimd::load(&buffer[0]);
-    sum = widening_hadd(v);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = simd::whadd(v);
     assert(sum == vec_size);
 
     buffer.fill(0);
-    v = xsimd::load(&buffer[0]);
-    sum = widening_hadd(v);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = simd::whadd(v);
     assert(sum == 0);
 
     buffer.fill(std::numeric_limits<int16_t>::max());
-    v = xsimd::load(&buffer[0]);
-    sum = widening_hadd(v);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = simd::whadd(v);
     assert(sum == std::numeric_limits<int16_t>::max() * (int)vec_size);
 
     buffer.fill(std::numeric_limits<int16_t>::min());
-    v = xsimd::load(&buffer[0]);
-    sum = widening_hadd(v);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = simd::whadd(v);
     assert(sum == std::numeric_limits<int16_t>::min() * (int)vec_size);
 
 }