mirror of
https://github.com/drewcassidy/quicktex.git
synced 2024-09-13 06:37:34 +00:00
Add widening hadd
This commit is contained in:
parent
8c77356aca
commit
aa6bd9602d
@ -18,6 +18,50 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
#include <xsimd/xsimd.hpp>
|
||||
|
||||
#include "util.h"
|
||||
|
||||
namespace quicktex {
|
||||
/**
 * Horizontally adds every lane of a SIMD batch, accumulating in the next-wider integer type.
 *
 * Portable fallback used when no architecture-specific specialization of widening_hadd
 * matches: the batch is spilled to a stack buffer and the lanes are summed one by one,
 * each lane being converted to the wider type before it is added.
 *
 * @tparam A xsimd architecture tag of the batch
 * @tparam T lane type; next_size<T> must be specialized for it
 * @param arg batch whose lanes are summed
 * @return the sum of all lanes, as next_size_t<T>
 */
template <class A, class T> inline next_size_t<T> widening_hadd(xsimd::batch<T, A> const& arg) {
    using b_type = xsimd::batch<T, A>;
    using r_type = next_size_t<T>;
    constexpr auto len = b_type::size;

    // std::array<T, N> only guarantees alignof(T), not the register alignment,
    // so the spill must use the unaligned store.
    std::array<T, len> buff;
    arg.store_unaligned(buff.data());

    r_type sum = 0;
    for (const T lane : buff) { sum += static_cast<r_type>(lane); }

    return sum;
}
|
||||
|
||||
#if XSIMD_WITH_NEON64
|
||||
/// NEON (AArch64) specialization of widening_hadd for int16_t lanes.
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::neon64> const& arg) {
    using wide_batch = xsimd::batch<int32_t, xsimd::neon64>;

    // Step 1: pairwise widening add turns N int16 lanes into N/2 int32 lanes.
    const wide_batch pair_sums = vpaddlq_s16(arg);

    // Step 2: reduce the widened lanes to a single scalar.
    return xsimd::hadd(pair_sums);
}
|
||||
#endif
|
||||
|
||||
#if XSIMD_WITH_SSE2
|
||||
/// SSE2 specialization of widening_hadd for int16_t lanes.
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::sse2> const& arg) {
    using wide_batch = xsimd::batch<int32_t, xsimd::sse2>;

    // Step 1: multiply-add against a vector of ones acts as a pairwise widening add,
    // turning N int16 lanes into N/2 int32 lanes.
    const wide_batch pair_sums = _mm_madd_epi16(arg, _mm_set1_epi16(1));

    // Step 2: reduce the widened lanes to a single scalar.
    return xsimd::hadd(pair_sums);
}
|
||||
#endif
|
||||
|
||||
#if XSIMD_WITH_AVX2
|
||||
/// AVX2 specialization of widening_hadd for int16_t lanes.
template <> inline int32_t widening_hadd(xsimd::batch<int16_t, xsimd::avx2> const& arg) {
    using wide_batch = xsimd::batch<int32_t, xsimd::avx2>;

    // Step 1: multiply-add against a vector of ones acts as a pairwise widening add,
    // turning N int16 lanes into N/2 int32 lanes.
    const wide_batch pair_sums = _mm256_madd_epi16(arg, _mm256_set1_epi16(1));

    // Step 2: reduce the widened lanes to a single scalar.
    return xsimd::hadd(pair_sums);
}
|
||||
#endif
|
||||
|
||||
} // namespace quicktex
|
@ -24,8 +24,39 @@
|
||||
#include <cstdint>
|
||||
#include <numeric>
|
||||
|
||||
#include <xsimd/xsimd.hpp>
|
||||
#include "../VecUtil.h"
|
||||
|
||||
namespace quicktex::tests {
|
||||
|
||||
void test_widening_hadd() {
|
||||
const auto vec_size = xsimd::batch<int16_t>::size;
|
||||
std::array<int16_t, vec_size> buffer;
|
||||
|
||||
std::iota(buffer.begin(), buffer.end(), 1);
|
||||
auto v = xsimd::load(&buffer[0]);
|
||||
auto sum = widening_hadd(v);
|
||||
assert(sum == vec_size / 2 * (vec_size + 1)); // Gauss formula
|
||||
|
||||
buffer.fill(1);
|
||||
v = xsimd::load(&buffer[0]);
|
||||
sum = widening_hadd(v);
|
||||
assert(sum == vec_size);
|
||||
|
||||
buffer.fill(0);
|
||||
v = xsimd::load(&buffer[0]);
|
||||
sum = widening_hadd(v);
|
||||
assert(sum == 0);
|
||||
|
||||
buffer.fill(std::numeric_limits<int16_t>::max());
|
||||
v = xsimd::load(&buffer[0]);
|
||||
sum = widening_hadd(v);
|
||||
assert(sum == std::numeric_limits<int16_t>::max() * (int)vec_size);
|
||||
|
||||
buffer.fill(std::numeric_limits<int16_t>::min());
|
||||
v = xsimd::load(&buffer[0]);
|
||||
sum = widening_hadd(v);
|
||||
assert(sum == std::numeric_limits<int16_t>::min() * (int)vec_size);
|
||||
|
||||
}
|
||||
} // namespace quicktex::tests
|
@ -21,4 +21,6 @@
|
||||
|
||||
namespace quicktex::tests {
|
||||
|
||||
// Asserts that widening_hadd returns the expected sum for int16_t batches filled with
// 1..N, all ones, all zeros, and the int16_t maximum and minimum values.
void test_widening_hadd();
|
||||
|
||||
} // namespace quicktex::tests
|
@ -175,4 +175,12 @@ template <typename... Args> std::string Format(const char *str, const Args &...a
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
/// Type trait mapping an integer type to the next wider integer type of the same
/// signedness. The primary template is left undefined, so using it with an unmapped
/// type is a compile-time error.
template <class> struct next_size;

/// Convenience alias for next_size<T>::type.
template <class T> using next_size_t = typename next_size<T>::type;

/// Helper base that exposes T as a nested `type` member.
template <class T> struct Tag { using type = T; };

// Signed mappings.
template <> struct next_size<int_least8_t> : Tag<int_least16_t> {};
template <> struct next_size<int_least16_t> : Tag<int_least32_t> {};
template <> struct next_size<int_least32_t> : Tag<int_least64_t> {};

// Unsigned mappings, so unsigned lanes can be widened as well.
template <> struct next_size<uint_least8_t> : Tag<uint_least16_t> {};
template <> struct next_size<uint_least16_t> : Tag<uint_least32_t> {};
template <> struct next_size<uint_least32_t> : Tag<uint_least64_t> {};
|
@ -36,10 +36,14 @@ function(set_simd_flags target_name)
|
||||
if (simd_mode STREQUAL "AUTO")
|
||||
if (MSVC)
|
||||
#MSVC has no -march=native equivalent. womp
|
||||
elseif (!ARM)
|
||||
elseif (NOT ARM)
|
||||
# setting -march=native on an M1 causes Clang to freak out,
|
||||
# and arm64 is pretty samey instruction set wise (arm9 and SVE2 notwithstanding)
|
||||
target_compile_options(${target_name} PUBLIC -march=native)
|
||||
|
||||
# Currently AVX512 will cause problems with buffer overruns,
|
||||
# and I don't have good test hardware for it anyway
|
||||
|
||||
target_compile_options(${target_name} PUBLIC -march=native -mno-avx512f)
|
||||
endif ()
|
||||
elseif (simd_mode STREQUAL "SSSE3")
|
||||
if (MSVC)
|
||||
|
Loading…
Reference in New Issue
Block a user