Add widening hadd

2024-09-13 06:37:34 +00:00 · 2022-05-21 21:23:22 -07:00 · 2022-05-21 21:23:22 -07:00 · aa6bd9602d
commit aa6bd9602d
parent 8c77356aca
5 changed files with 92 additions and 3 deletions
--- a/quicktex/VecUtil.h
+++ b/quicktex/VecUtil.h
@ -18,6 +18,50 @@
 */

 #pragma once
+
+#include <array>
+#include <type_traits>
+#include <xsimd/xsimd.hpp>
+
+#include "util.h"
+
 namespace quicktex {
+/**
+ * Horizontally add every lane of a SIMD batch, accumulating in the next wider integer type.
+ * Generic fallback used when no architecture-specific specialization applies:
+ * the batch is spilled to a stack buffer and summed with scalar code.
+ * @tparam A xsimd architecture tag of the batch
+ * @tparam T lane type; next_size<T> must be specialized for it
+ * @param arg batch whose lanes are summed
+ * @return sum of all lanes as next_size_t<T>
+ */
+template <class A, class T> inline next_size_t<T> widening_hadd(xsimd::batch<T, A> const& arg) {
+    using b_type = xsimd::batch<T, A>;
+    using r_type = next_size_t<T>;
+    constexpr auto len = b_type::size;
+
+    // std::array only guarantees alignof(T), not batch alignment, so spill with an unaligned store
+    std::array<T, len> buff;
+    arg.store_unaligned(buff.data());
+
+    r_type sum = 0;
+    for (const T lane : buff) { sum += static_cast<r_type>(lane); }
+
+    return sum;
+}
+
+#if XSIMD_WITH_NEON64
+/// NEON64 fast path for batches of int16_t lanes.
+template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::neon64> const& arg) {
+    // vpaddlq_s16 adds adjacent int16 lanes into int32 lanes; hadd then reduces the N/2 widened lanes
+    return xsimd::hadd(xsimd::batch<int32_t, xsimd::neon64>(vpaddlq_s16(arg)));
+}
+#endif
+
+#if XSIMD_WITH_SSE2
+/// SSE2 fast path for batches of int16_t lanes.
+template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::sse2> const& arg) {
+    // Multiplying by 1 turns the multiply-add into a pairwise widening sum of adjacent lanes
+    const __m128i ones = _mm_set1_epi16(1);
+    // hadd then reduces the N/2 widened int32 lanes to a scalar
+    return xsimd::hadd(xsimd::batch<int32_t, xsimd::sse2>(_mm_madd_epi16(arg, ones)));
+}
+#endif
+
+#if XSIMD_WITH_AVX2
+/// AVX2 fast path for batches of int16_t lanes.
+template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::avx2> const& arg) {
+    // Multiplying by 1 turns the multiply-add into a pairwise widening sum of adjacent lanes
+    const __m256i ones = _mm256_set1_epi16(1);
+    // hadd then reduces the N/2 widened int32 lanes to a scalar
+    return xsimd::hadd(xsimd::batch<int32_t, xsimd::avx2>(_mm256_madd_epi16(arg, ones)));
+}
+#endif

 }  // namespace quicktex
--- a/quicktex/tests/TestSIMD.cpp
+++ b/quicktex/tests/TestSIMD.cpp
@ -24,8 +24,39 @@
 #include <cstdint>
 #include <numeric>

+#include <xsimd/xsimd.hpp>
 #include "../VecUtil.h"

 namespace quicktex::tests {

+/**
+ * Checks widening_hadd on the default-architecture int16_t batch against sums known in closed form:
+ * the sequence 1..N, all ones, all zeros, and the int16_t maximum and minimum in every lane.
+ */
+void test_widening_hadd() {
+    constexpr auto vec_size = xsimd::batch<int16_t>::size;
+    constexpr auto lanes = static_cast<int32_t>(vec_size);  // signed copy for comparing against the int32_t sum
+    std::array<int16_t, vec_size> buffer;
+
+    // buffer is only guaranteed alignof(int16_t) alignment, so it must be read with unaligned loads
+    std::iota(buffer.begin(), buffer.end(), 1);
+    auto v = xsimd::load_unaligned(&buffer[0]);
+    auto sum = widening_hadd(v);
+    assert(sum == lanes / 2 * (lanes + 1));  // Gauss formula
+
+    buffer.fill(1);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = widening_hadd(v);
+    assert(sum == lanes);
+
+    buffer.fill(0);
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = widening_hadd(v);
+    assert(sum == 0);
+
+    // The extreme cases only fit because the result is accumulated in the wider type
+    buffer.fill(std::numeric_limits<int16_t>::max());
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = widening_hadd(v);
+    assert(sum == std::numeric_limits<int16_t>::max() * lanes);
+
+    buffer.fill(std::numeric_limits<int16_t>::min());
+    v = xsimd::load_unaligned(&buffer[0]);
+    sum = widening_hadd(v);
+    assert(sum == std::numeric_limits<int16_t>::min() * lanes);
+}
 }  // namespace quicktex::tests
--- a/quicktex/tests/TestSIMD.h
+++ b/quicktex/tests/TestSIMD.h
@ -21,4 +21,6 @@

 namespace quicktex::tests {

+// Asserts that widening_hadd returns the expected lane sums; defined in TestSIMD.cpp
+void test_widening_hadd();
+
 } // namespace quicktex::tests
--- a/quicktex/util.h
+++ b/quicktex/util.h
@ -175,4 +175,12 @@ template <typename... Args> std::string Format(const char *str, const Args &...a
    }

    return output;
-}
+}
+
+// Compile-time map from a signed int_leastN_t type to the next larger one.
+// The primary template is left undefined, so using an unmapped type is a compile error.
+template <class > struct next_size;
+// Shorthand for next_size<T>::type
+template <class T> using next_size_t = typename next_size<T>::type;
+// Helper base that exposes its template argument as the nested `type`
+template <class T> struct Tag { using type = T; };
+
+// Only the signed 8-, 16- and 32-bit "least" types are mapped
+template <> struct next_size<int_least8_t>  : Tag<int_least16_t> { };
+template <> struct next_size<int_least16_t> : Tag<int_least32_t> { };
+template <> struct next_size<int_least32_t> : Tag<int_least64_t> { };
--- a/tools/SIMDFlags.cmake
+++ b/tools/SIMDFlags.cmake
@ -36,10 +36,14 @@ function(set_simd_flags target_name)
        if (simd_mode STREQUAL "AUTO")
            if (MSVC)
                #MSVC has no -march=native equivalent. womp
-            elseif (!ARM)
+            elseif (NOT ARM)
                # setting -march=native on an M1 causes Clang to freak out,
                # and arm64 is pretty samey instruction set wise (arm9 and SVE2 notwithstanding)
-                target_compile_options(${target_name} PUBLIC -march=native)
+
+                # Currently AVX512 will cause problems with buffer overruns,
+                # and I don't have good test hardware for it anyway
+
+                target_compile_options(${target_name} PUBLIC -march=native -mno-avx512f)
            endif ()
        elseif (simd_mode STREQUAL "SSSE3")
            if (MSVC)