Add Vector template class

Also experimentally bump to C++20 just to see if it works on GCC 9.3
2024-09-13 06:37:34 +00:00 · 2022-05-29 15:54:55 -07:00 · 2022-05-29 15:54:55 -07:00 · debaa6b54d
commit debaa6b54d
parent c96450b5fe
10 changed files with 528 additions and 83 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@ -2,15 +2,15 @@ FormatStyle: google

 Checks: '-*,clang-diagnostic-*,llvm-*,misc-*,-misc-unused-parameters,-misc-non-private-member-variables-in-classes,readability-identifier-naming,cppcoreguidelines-narrowing-conversions'
 CheckOptions:
-  - { key: readability-identifier-naming.NamespaceCase,          value: lower_case }
-  - { key: readability-identifier-naming.ClassCase,              value: CamelCase }
-  - { key: readability-identifier-naming.StructCase,             value: CamelCase }
-  - { key: readability-identifier-naming.TemplateParameterCase,  value: CamelCase }
-  - { key: readability-identifier-naming.FunctionCase,           value: aNy_CasE }
-  - { key: readability-identifier-naming.VariableCase,           value: lower_case }
-  - { key: readability-identifier-naming.MemberCase,        value: lower_case }
-  - { key: readability-identifier-naming.PrivateMemberPrefix,    value: _ }
-  - { key: readability-identifier-naming.ProtectedMemberPrefix,  value: _ }
+  - { key: readability-identifier-naming.NamespaceCase,            value: lower_case }
+  - { key: readability-identifier-naming.ClassCase,                value: CamelCase }
+  - { key: readability-identifier-naming.StructCase,               value: CamelCase }
+  - { key: readability-identifier-naming.TemplateParameterCase,    value: CamelCase }
+  - { key: readability-identifier-naming.FunctionCase,             value: lower_case }
+  - { key: readability-identifier-naming.VariableCase,             value: lower_case }
+  - { key: readability-identifier-naming.MemberCase,               value: lower_case }
+  - { key: readability-identifier-naming.PrivateMemberPrefix,      value: _ }
+  - { key: readability-identifier-naming.ProtectedMemberPrefix,    value: _ }
  - { key: readability-identifier-naming.EnumConstantCase,         value: CamelCase }
  - { key: readability-identifier-naming.ConstexprVariableCase,    value: CamelCase }
  - { key: readability-identifier-naming.GlobalConstantCase,       value: CamelCase }
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -57,7 +57,7 @@ endif ()
 target_link_libraries(_quicktex PUBLIC xsimd)

 # Set module features, like C/C++ standards
-target_compile_features(_quicktex PUBLIC cxx_std_17 c_std_11)
+target_compile_features(_quicktex PUBLIC cxx_std_20 c_std_11)

 # Set compiler warnings
 set_project_warnings(_quicktex)
--- a/quicktex/Color.h
+++ b/quicktex/Color.h
@ -0,0 +1,73 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include "Vec.h"
+
+namespace quicktex::color {
+
+// Largest values representable in 5 and 6 bits, i.e. the per-channel maxima of a 5/6/5-bit color.
+// `inline` gives each constant one definition shared by every translation unit that includes this header.
+inline constexpr size_t uint5_max = (1 << 5) - 1;
+inline constexpr size_t uint6_max = (1 << 6) - 1;
+
+/**
+ * Table of N float thresholds that is filled in by a constexpr constructor
+ * Entry i (for i < N - 1) holds i/N + 0.5/N; the final entry holds the large sentinel 1e+37.
+ * @tparam N number of entries in the table
+ */
+template <size_t N> struct MidpointTable {
+   public:
+    constexpr MidpointTable() : _values() {
+        constexpr auto count = static_cast<float>(N);
+
+        // sentinel in the last slot: larger than any value the table is compared against
+        _values[N - 1] = 1e+37f;
+
+        for (unsigned slot = 0; slot + 1 < N; ++slot) {
+            _values[slot] = (static_cast<float>(slot) / count) + (0.5f / count);
+        }
+    }
+
+    /**
+     * Read the threshold stored at index i
+     * @param i index to read from, must be below N (checked by assert)
+     * @return the threshold at index i
+     */
+    float operator[](size_t i) const {
+        assert(i < N);
+        return _values[i];
+    }
+
+   private:
+    float _values[N];
+};
+
+// Threshold tables for 5-bit and 6-bit channels, read by precise_round_565.
+// Declared `inline` so every translation unit including this header shares one object; a plain
+// namespace-scope constexpr object has internal linkage and would be a distinct object per TU.
+inline constexpr MidpointTable<32> Midpoints5bit;
+inline constexpr MidpointTable<64> Midpoints6bit;
+
+/**
+ * Rescale the channels of a color from 8 bits each to 5, 6 and 5 bits
+ * @tparam T component type of the color
+ * @param unscaled color whose r, g and b components hold 8-bit values
+ * @return color with r and b rescaled to 5 bits and g rescaled to 6 bits
+ */
+template <typename T> Vec<T, 3> scale_to_565(Vec<T, 3> unscaled) {
+    const T red = scale_from_8<T, 5>(unscaled.r());
+    const T green = scale_from_8<T, 6>(unscaled.g());
+    const T blue = scale_from_8<T, 5>(unscaled.b());
+
+    return Vec<T, 3>{red, green, blue};
+}
+
+/**
+ * Rescale the channels of a color from 5, 6 and 5 bits to 8 bits each
+ * @tparam T component type of the color
+ * @param scaled color whose r and b components hold 5-bit values and whose g component holds a 6-bit value
+ * @return color with every channel rescaled to 8 bits
+ */
+template <typename T> Vec<T, 3> scale_from_565(Vec<T, 3> scaled) {
+    const T red = scale_to_8<T, 5>(scaled.r());
+    const T green = scale_to_8<T, 6>(scaled.g());
+    const T blue = scale_to_8<T, 5>(scaled.b());
+
+    return Vec<T, 3>{red, green, blue};
+}
+
+/**
+ * Quantize a floating-point color to 5/6/5-bit integer channels using the midpoint tables
+ * @tparam T integral component type of the result
+ * @param v color to quantize; it is only read, so it is taken by const reference
+ * @return color with r and b no larger than uint5_max and g no larger than uint6_max
+ */
+template <typename T = int16_t> Vec<T, 3> precise_round_565(const Vec<float, 3> &v) {
+    auto scaled = v * Vec<float, 3>{uint5_max, uint6_max, uint5_max};       // rescale from (0,1) to (0,int_max)
+    auto rounded = static_cast<Vec<T, 3>>(scaled);                          // downcast to integral type
+    rounded = rounded.clamp({0, 0, 0}, {uint5_max, uint6_max, uint5_max});  // clamp to avoid out of bounds float errors
+
+    // increment each channel if above the rounding point
+    // (the last table entry is a huge sentinel, so a channel already at its maximum is never incremented)
+    if (v.r() > Midpoints5bit[rounded.r()]) rounded.r()++;
+    if (v.g() > Midpoints6bit[rounded.g()]) rounded.g()++;
+    if (v.b() > Midpoints5bit[rounded.b()]) rounded.b()++;
+
+    assert(rounded.r() <= uint5_max);
+    assert(rounded.g() <= uint6_max);
+    assert(rounded.b() <= uint5_max);
+
+    return rounded;
+}
+}  // namespace quicktex::color
--- a/quicktex/OldColor.cpp
+++ b/quicktex/OldColor.cpp
@ -46,7 +46,9 @@ uint16_t OldColor::Pack565Unscaled(uint8_t r, uint8_t g, uint8_t b) {
    return static_cast<uint16_t>(b | (g << 5) | (r << 11));
 }

-uint16_t OldColor::Pack565(uint8_t r, uint8_t g, uint8_t b) { return Pack565Unscaled(scale8To5(r), scale8To6(g), scale8To5(b)); }
+uint16_t OldColor::Pack565(uint8_t r, uint8_t g, uint8_t b) {
+    // run each 8-bit channel through its 8-to-5 / 8-to-6 scaler, then pack the scaled channels
+    const auto r5 = scale8To5(r);
+    const auto g6 = scale8To6(g);
+    const auto b5 = scale8To5(b);
+    return Pack565Unscaled(r5, g6, b5);
+}

 OldColor OldColor::Unpack565Unscaled(uint16_t Packed) {
    uint8_t r = (Packed >> 11) & 0x1F;
@ -92,22 +94,12 @@ void OldColor::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
    b = vb;
 }

-size_t OldColor::MinChannelRGB() {
-    if (r <= g && r <= b) return 0;
-    if (g <= b && g <= r) return 1;
-    return 2;
-}
-
 size_t OldColor::MaxChannelRGB() {
    if (r >= g && r >= b) return 0;
    if (g >= b && g >= r) return 1;
    return 2;
 }

-OldColor OldColor::Min(const OldColor &A, const OldColor &B) { return OldColor(std::min(A[0], B[0]), std::min(A[1], B[1]), std::min(A[2], B[2]), std::min(A[3], B[3])); }
-
-OldColor OldColor::Max(const OldColor &a, const OldColor &b) { return OldColor(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
-
 OldColor::operator Vector4() const { return Vector4(r, g, b, a); }
 OldColor::operator Vector4Int() const { return Vector4Int(r, g, b, a); }
 Vector4Int operator-(const OldColor &lhs, const OldColor &rhs) {
--- a/quicktex/Vec.h
+++ b/quicktex/Vec.h
@ -0,0 +1,245 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <initializer_list>
+#include <numeric>
+#include <type_traits>
+#include <xsimd/xsimd.hpp>
+
+#include "util.h"
+
+namespace quicktex {
+
+/**
+ * Fixed-size vector of N components of type T, stored in a std::array
+ * Offers component-wise arithmetic, comparison, bit packing and clamping.
+ * @tparam T component type
+ * @tparam N number of components
+ */
+template <typename T, size_t N> class Vec {
+   public:
+    // region constructors
+    /**
+     * Create a vector from an initializer list
+     * Components not covered by the list are value-initialized.
+     * @param vals values to populate with, at most N of them
+     */
+    Vec(std::initializer_list<T> vals) : _c{} {
+        assert(vals.size() <= N);  // copying more than N values would write past the end of _c
+        std::copy(vals.begin(), vals.end(), _c.begin());
+    }
+
+    /**
+     * Create a vector from a scalar value
+     * @param scalar value to populate with
+     */
+    Vec(const T &scalar = 0) { _c.fill(scalar); }
+
+    /**
+     * Create a vector from another vector of the same size and another type
+     * Explicit, so vectors of different component types are never mixed silently in one expression.
+     * @tparam S Source vector type
+     * @param rvalue Source vector to copy from
+     */
+    template <typename S>
+        requires std::is_convertible_v<S, T>
+    explicit Vec(const Vec<S, N> &rvalue) {
+        for (unsigned i = 0; i < N; i++) { at(i) = static_cast<T>(rvalue[i]); }
+    }
+
+    /**
+     * Create a vector from a naked pointer
+     * @tparam S Source data type
+     * @param ptr Pointer to the start of the source data. N values will be read.
+     */
+    template <typename S> Vec(const S *ptr) {
+        for (unsigned i = 0; i < N; i++) { at(i) = static_cast<T>(ptr[i]); }
+    }
+
+    /**
+     * Create a vector from a std::array
+     * @tparam S Source data type
+     * @param arr Array to copy from
+     */
+    template <typename S> Vec(const std::array<S, N> &arr) : Vec(arr.data()) {}
+    // endregion
+
+    // region subscript accessors
+    /**
+     * Get the element at index i
+     * @param i index to read from
+     * @return the element at index i
+     */
+    T at(size_t i) const {
+        assert(i < N);
+        return _c[i];
+    }
+
+    /**
+     * Get a reference to the element at index i
+     * @param i index to read from
+     * @return Reference to the element at index i
+     */
+    T &at(size_t i) {
+        assert(i < N);
+        return _c[i];
+    }
+
+    /**
+     * Get the element at index i
+     * @param i index to read from
+     * @return the element at index i
+     */
+    T operator[](size_t i) const { return at(i); }
+
+    /**
+     * Get a reference to the element at index i
+     * @param i index to read from
+     * @return Reference to the element at index i
+     */
+    T &operator[](size_t i) { return at(i); }
+
+    // data() is used instead of begin()/end() because std::array iterators are not guaranteed to be raw pointers
+    T *begin() { return _c.data(); }
+    T *end() { return _c.data() + N; }
+    const T *begin() const { return _c.data(); }
+    const T *end() const { return _c.data() + N; }
+
+    // endregion
+
+    // region accessor shortcuts
+    // Each accessor only exists when the vector has enough components for it.
+    // RGBA accessors
+    T r() const requires(N >= 1) { return _c[0]; }
+    T &r() requires(N >= 1) { return _c[0]; }
+    T g() const requires(N >= 2) { return _c[1]; }
+    T &g() requires(N >= 2) { return _c[1]; }
+    T b() const requires(N >= 3) { return _c[2]; }
+    T &b() requires(N >= 3) { return _c[2]; }
+    T a() const requires(N >= 4) { return _c[3]; }
+    T &a() requires(N >= 4) { return _c[3]; }
+
+    // XYZW accessors
+    T x() const requires(N >= 1) { return _c[0]; }
+    T &x() requires(N >= 1) { return _c[0]; }
+    T y() const requires(N >= 2) { return _c[1]; }
+    T &y() requires(N >= 2) { return _c[1]; }
+    T z() const requires(N >= 3) { return _c[2]; }
+    T &z() requires(N >= 3) { return _c[2]; }
+    T w() const requires(N >= 4) { return _c[3]; }
+    T &w() requires(N >= 4) { return _c[3]; }
+    // endregion
+
+    // region simple operators
+    // component-wise arithmetic between two vectors
+    friend Vec operator+(const Vec &lhs, const Vec &rhs) { return map(lhs, rhs, std::plus()); }
+    friend Vec operator-(const Vec &lhs, const Vec &rhs) { return map(lhs, rhs, std::minus()); }
+    friend Vec operator*(const Vec &lhs, const Vec &rhs) { return map(lhs, rhs, std::multiplies()); }
+    friend Vec operator/(const Vec &lhs, const Vec &rhs) { return map(lhs, rhs, std::divides()); }
+
+    // arithmetic between every component of a vector and one scalar
+    friend Vec operator+(const Vec &lhs, const T &rhs) { return map(lhs, rhs, std::plus()); }
+    friend Vec operator-(const Vec &lhs, const T &rhs) { return map(lhs, rhs, std::minus()); }
+    friend Vec operator*(const Vec &lhs, const T &rhs) { return map(lhs, rhs, std::multiplies()); }
+    friend Vec operator/(const Vec &lhs, const T &rhs) { return map(lhs, rhs, std::divides()); }
+
+    friend Vec &operator+=(Vec &lhs, const Vec &rhs) { return lhs = lhs + rhs; }
+    friend Vec &operator-=(Vec &lhs, const Vec &rhs) { return lhs = lhs - rhs; }
+    friend Vec &operator*=(Vec &lhs, const Vec &rhs) { return lhs = lhs * rhs; }
+    friend Vec &operator/=(Vec &lhs, const Vec &rhs) { return lhs = lhs / rhs; }
+
+    friend Vec &operator+=(Vec &lhs, const T &rhs) { return lhs = lhs + rhs; }
+    friend Vec &operator-=(Vec &lhs, const T &rhs) { return lhs = lhs - rhs; }
+    friend Vec &operator*=(Vec &lhs, const T &rhs) { return lhs = lhs * rhs; }
+    friend Vec &operator/=(Vec &lhs, const T &rhs) { return lhs = lhs / rhs; }
+
+    bool operator==(const Vec &rhs) const { return _c == rhs._c; }
+    bool operator!=(const Vec &rhs) const { return _c != rhs._c; }
+    // endregion
+
+    /**
+     * Write the N components to a destination buffer
+     * @tparam U Destination data type
+     * @param ptr Pointer to the start of the destination. N values will be written.
+     */
+    template <typename U> void write(U *ptr) const {
+        // a raw byte copy is only valid when no conversion is needed and T may be copied bytewise
+        if constexpr (std::is_same_v<T, U> && std::is_trivially_copyable_v<T>) {
+            std::memcpy(ptr, _c.data(), N * sizeof(T));
+        } else {
+            for (unsigned i = 0; i < N; i++) { ptr[i] = static_cast<U>(_c[i]); }
+        }
+    }
+
+    /**
+     * Pack the components into the bits of a single unsigned integer
+     * Component 0 ends up in the most significant of the used bits, component N - 1 in the least significant.
+     * @tparam P Unsigned type of the packed result
+     * @tparam W Component type of the widths vector
+     * @param widths Number of bits to use for each component
+     * @return the packed value
+     */
+    template <typename P = T, typename W = size_t>
+        requires std::is_unsigned_v<P> && std::is_integral_v<T>
+    P pack(const Vec<W, N> &widths) const {
+        // the packed type must have room for all of the requested bits
+        assert((sizeof(P) * 8) >= (size_t)std::accumulate(widths.begin(), widths.end(), 0));
+
+        P packed = 0;
+
+        for (unsigned i = 0; i < N; i++) {
+            T val = at(i);
+            if constexpr (std::is_signed_v<T>) { val &= ((1 << widths[i]) - 1); }  // mask out upper bits of signed vals
+
+            assert(val < (1 << widths[i]));
+
+            packed = (packed << widths[i]) | val;
+        }
+        return packed;
+    }
+
+    /** @return the sum of all components */
+    T sum() const { return std::accumulate(begin(), end(), T{0}); }
+
+    /**
+     * Dot product with another vector
+     * @param rhs vector to multiply with
+     * @return the sum of the component-wise products
+     */
+    T dot(const Vec &rhs) const {
+        Vec product = (*this) * rhs;
+        return product.sum();
+    }
+
+    /** @return the dot product of this vector with itself */
+    T sqr_mag() const { return this->dot(*this); }
+
+    /** @return a vector holding quicktex::abs of every component */
+    Vec abs() const {
+        return map(*this, [](T val) { return quicktex::abs(val); });
+    }
+
+    /**
+     * Clamp every component to the same range
+     * @param low lower bound applied to every component
+     * @param high upper bound applied to every component
+     * @return the clamped vector
+     */
+    Vec clamp(const T &low, const T &high) const {
+        return map(*this, [&low, &high](T val) { return quicktex::clamp(val, low, high); });
+    }
+
+    /**
+     * Clamp every component to its own range
+     * @param low per-component lower bounds
+     * @param high per-component upper bounds
+     * @return the clamped vector
+     */
+    Vec clamp(const Vec &low, const Vec &high) const {
+        Vec result;
+        for (unsigned i = 0; i < N; i++) { result[i] = quicktex::clamp(at(i), low[i], high[i]); }
+        return result;
+    }
+
+   protected:
+    std::array<T, N> _c;  // internal array of components
+
+    // apply the unary operation f to every component of lhs
+    template <typename Op> static inline Vec map(const Vec &lhs, Op f) {
+        Vec result;
+        for (unsigned i = 0; i < N; i++) { result[i] = f(lhs[i]); }
+        return result;
+    }
+
+    // apply the binary operation f to every component of lhs paired with the scalar rhs
+    template <typename Op> static inline Vec map(const Vec &lhs, const T &rhs, Op f) {
+        Vec result;
+        for (unsigned i = 0; i < N; i++) { result[i] = f(lhs[i], rhs); }
+        return result;
+    }
+
+    // apply the binary operation f to every pair of matching components of lhs and rhs
+    template <typename Op> static inline Vec map(const Vec &lhs, const Vec &rhs, Op f) {
+        Vec result;
+        for (unsigned i = 0; i < N; i++) { result[i] = f(lhs[i], rhs[i]); }
+        return result;
+    }
+};
+
+/**
+ * Vector whose N components are xsimd batches of T
+ * Inherits publicly from Vec so the component accessors and operators of Vec stay usable.
+ * @tparam T scalar type held in each batch
+ * @tparam N number of components
+ * @tparam A xsimd architecture of the batches
+ */
+template <typename T, size_t N, typename A = xsimd::default_arch> class BatchVec : public Vec<xsimd::batch<T, A>, N> {
+   public:
+    /**
+     * Store each component batch to its own memory row
+     * @tparam M xsimd alignment mode tag; unaligned when the tag argument is omitted
+     * @param mem_rows destination pointer for each of the N components
+     */
+    template <typename M = xsimd::unaligned_mode> void store(std::array<T *, N> mem_rows, M = {}) const {
+        for (unsigned i = 0; i < N; i++) { this->_c[i].store(mem_rows[i], M{}); }
+    }
+
+    /**
+     * Load each component batch from its own memory row
+     * @tparam M xsimd alignment mode tag; unaligned when the tag argument is omitted
+     * @param mem_rows source pointer for each of the N components
+     * @return the loaded batch vector
+     */
+    template <typename M = xsimd::unaligned_mode> static BatchVec load(std::array<T *, N> mem_rows, M = {}) {
+        BatchVec val;
+        for (unsigned i = 0; i < N; i++) { val[i] = xsimd::load<A, T>(mem_rows[i], M{}); }
+        return val;
+    }
+};
+
+}  // namespace quicktex
--- a/quicktex/ctests/TestSIMD.cpp
+++ b/quicktex/ctests/TestSIMD.cpp
@ -17,8 +17,6 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include "TestSIMD.h"
-
 #include <utest.h>

 #include <array>
--- a/quicktex/ctests/TestSIMD.h
+++ b/quicktex/ctests/TestSIMD.h
@ -1,26 +0,0 @@
-/*  Quicktex Texture Compression Library
-    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
-    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
-    and licenced under the public domain
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Lesser General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#pragma once
-
-namespace quicktex::tests {
-
-void test_widening_hadd();
-
-} // namespace quicktex::tests
--- a/quicktex/ctests/TestVec.cpp
+++ b/quicktex/ctests/TestVec.cpp
@ -0,0 +1,140 @@
+/*  Quicktex Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <utest.h>
+
+#include "../Vec.h"
+
+namespace quicktex::tests {
+
+// region Vec_float unit tests
+// Component-wise addition of two float vectors
+UTEST(Vec_float, add) {
+    const Vec<float, 3> lhs{1.0f, 1.5f, 2.0f};
+    const Vec<float, 3> rhs{2.0f, -2.5f, 3.0f};
+    const Vec<float, 3> expected{3.0f, -1.0f, 5.0f};
+
+    // squared distance between the actual and the expected result
+    const Vec<float, 3> error = (lhs + rhs) - expected;
+    const float diff = error.sqr_mag();
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Component-wise subtraction of two float vectors
+UTEST(Vec_float, sub) {
+    const Vec<float, 3> lhs{1.0f, 1.5f, 2.0f};
+    const Vec<float, 3> rhs{3.0f, 1.5f, 1.0f};
+    const Vec<float, 3> expected{-2.0f, 0.0f, 1.0f};
+
+    // squared distance between the actual and the expected result
+    const Vec<float, 3> error = (lhs - rhs) - expected;
+    const float diff = error.sqr_mag();
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Component-wise multiplication of two float vectors
+UTEST(Vec_float, mul) {
+    const Vec<float, 3> lhs{1.0f, 1.5f, 2.0f};
+    const Vec<float, 3> rhs{3.0f, 1.5f, 0.0f};
+    const Vec<float, 3> expected{3.0f, 2.25f, 0.0f};
+
+    // squared distance between the actual and the expected result
+    const Vec<float, 3> error = (lhs * rhs) - expected;
+    const float diff = error.sqr_mag();
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Component-wise division of two float vectors
+UTEST(Vec_float, div) {
+    const Vec<float, 3> lhs{1.0f, 1.5f, 2.0f};
+    const Vec<float, 3> rhs{2.0f, 1.5f, 1.0f};
+    const Vec<float, 3> expected{0.5f, 1.0f, 2.0f};
+
+    // squared distance between the actual and the expected result
+    const Vec<float, 3> error = (lhs / rhs) - expected;
+    const float diff = error.sqr_mag();
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Sum of all components of a float vector
+UTEST(Vec_float, sum) {
+    const Vec<float, 5> values{1.0f, 2.0f, 3.5f, 4.0f, -4.0f};
+    const auto total = values.sum();
+
+    // absolute distance from the expected total of 6.5
+    const auto diff = abs(total - 6.5f);
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Dot product of two float vectors
+UTEST(Vec_float, dot) {
+    const Vec<float, 3> lhs{1.0f, 1.5f, 2.0f};
+    const Vec<float, 3> rhs{2.0f, 1.5f, 2.0f};
+    const auto product = lhs.dot(rhs);
+
+    // absolute distance from the expected product of 8.25
+    const auto diff = abs(product - 8.25f);
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Component-wise absolute value of a float vector
+UTEST(Vec_float, abs) {
+    const Vec<float, 3> input{1.0f, -5.0f, -1.0f};
+    const Vec<float, 3> expected{1.0f, 5.0f, 1.0f};
+
+    // squared distance between the actual and the expected result
+    const Vec<float, 3> error = input.abs() - expected;
+    const auto diff = error.sqr_mag();
+
+    ASSERT_LT(diff, 0.01f);
+}
+
+// Clamping against per-component bounds and against a single pair of scalar bounds
+UTEST(Vec_float, clamp) {
+    // per-component bounds
+    Vec<float, 6> a{-1, -1, -1, 1, 1, 1};
+    const Vec<float, 6> low1{-2, -0.5, -2, 0, 2, 0.5};
+    const Vec<float, 6> high1{-1.5, 0, 0, 0.5, 3, 2};
+    const Vec<float, 6> expected1{-1.5, -0.5, -1, 0.5, 2, 1};
+
+    const auto result1 = a.clamp(low1, high1);
+    const auto diff1 = (result1 - expected1).sqr_mag();
+
+    ASSERT_LT(diff1, 0.01f);
+
+    // scalar bounds shared by every component
+    Vec<float, 6> b{-1, -0.5, 0, 0.2, 0.5, 1};
+    const Vec<float, 6> expected2{-0.8, -0.5, 0, 0.2, 0.3, 0.3};
+
+    const auto result2 = b.clamp(-0.8, 0.3);
+    const auto diff2 = (result2 - expected2).sqr_mag();
+
+    ASSERT_LT(diff2, 0.01f);
+}
+
+// endregion
+
+// region Vec_int unit tests
+// Reading and writing components through operator[]
+UTEST(Vec_int, subscript) {
+    Vec<int, 4> values{1, 3, 1, 2};
+
+    // every component reads back as constructed
+    ASSERT_EQ(values[0], 1);
+    ASSERT_EQ(values[1], 3);
+    ASSERT_EQ(values[2], 1);
+    ASSERT_EQ(values[3], 2);
+
+    // a write through the returned reference is visible on the next read
+    values[2] = 4;
+    ASSERT_EQ(values[2], 4);
+}
+
+// Packing three components into 5, 6 and 5 bits of one 16-bit value
+UTEST(Vec_int, pack) {
+    const Vec<uint16_t, 3> components{0x1F, 0x2A, 0x01};
+    const Vec<int, 3> widths{5, 6, 5};
+
+    // (0x1F << 11) | (0x2A << 5) | 0x01 == 0xFD41
+    const auto result = components.pack(widths);
+
+    ASSERT_EQ(result, 0xFD41);
+}
+
+// Construction from a std::array and writing the components back out to memory
+UTEST(Vec_int, write) {
+    std::array<int, 4> arr{1, 3, 1, 2};
+    Vec<int, 4> a(arr);
+    Vec<int, 4> expected{1, 3, 1, 2};
+
+    ASSERT_TRUE(a == expected);
+
+    // start from different contents so the comparison below proves that write() stored every component
+    std::array<int, 4> out{-1, -3, -1, -2};
+    a.write(out.data());  // write() takes a raw pointer; an array iterator is not guaranteed to be one
+
+    ASSERT_TRUE(out == arr);
+}
+// endregion
+}  // namespace quicktex::tests
--- a/quicktex/s3tc/bc1/BC1Encoder.cpp
+++ b/quicktex/s3tc/bc1/BC1Encoder.cpp
@ -608,8 +608,8 @@ void BC1Encoder::FindEndpoints(EncodeResults &result, const CBlock &pixels, cons
        for (unsigned c = 0; c < 3; c++) {
            int inset = ((int)(metrics.max[c] - metrics.min[c]) - 8) >> 4;  // 1/16 of delta, with bias

-            min[c] = clamp255(metrics.min[c] + inset);
-            max[c] = clamp255(metrics.max[c] - inset);
+            min[c] = clamp(metrics.min[c] + inset, 0, 255);
+            max[c] = clamp(metrics.max[c] - inset, 0, 255);
        }

        int icov_xz = 0, icov_yz = 0;
--- a/quicktex/util.h
+++ b/quicktex/util.h
@ -20,11 +20,12 @@
 #pragma once
 #include <cassert>
 #include <cstdint>
+#include <functional>
 #include <limits>
 #include <string>
 #include <type_traits>
-#include <functional>
 #include <vector>
+#include <xsimd/xsimd.hpp>

 #define UINT5_MAX 0x1FU  // 31
 #define UINT6_MAX 0x3FU  // 63
@ -32,6 +33,34 @@
 #define assert5bit(x) assert(x <= UINT5_MAX)
 #define assert6bit(x) assert(x <= UINT6_MAX)

+namespace quicktex {
+
+/**
+ * Rescale an 8-bit value down to N bits
+ * Multiplies by the N-bit maximum, then divides by 255 using shifts with a rounding bias of 128.
+ * @tparam S value type
+ * @tparam N bit width of the result, below 8
+ * @param v value to rescale, must be below 2^8 (checked by assert)
+ * @return the value rescaled to N bits
+ */
+template <typename S, size_t N> S scale_from_8(S v) {
+    static_assert(N < 8);
+    assert(v < (1 << 8));
+
+    const unsigned target_max = (1 << N) - 1;
+    const unsigned biased = (v * target_max) + 128;
+    const unsigned folded = biased + (biased >> 8);
+    auto result = static_cast<S>(folded >> 8);
+
+    assert(result < (1 << N));
+
+    return result;
+}
+
+/**
+ * Expand an N-bit value to 8 bits
+ * Shifts the value up to the top of the byte and fills the freed low bits with the value's own upper bits.
+ * @tparam S value type
+ * @tparam N bit width of the input, at least 4 and below 8
+ * @param v value to expand, must be below 2^N (checked by assert)
+ * @return the value rescaled to 8 bits
+ */
+template <typename S, size_t N> S scale_to_8(S v) {
+    static_assert(N < 8);
+    static_assert(N >= 4, "rshift below would wrap around for N < 4");
+    assert(v < (1 << N));
+
+    constexpr unsigned lshift = 8 - N;
+    constexpr unsigned rshift = N - lshift;
+    S result = static_cast<S>((v << lshift) | (v >> rshift));
+
+    assert(result < (1 << 8));  // check the computed result, not the input
+
+    return result;
+}
+
 template <typename S> constexpr auto iabs(S i) {
    static_assert(!std::is_unsigned<S>::value);
    using O = typename std::make_unsigned<S>::type;
@ -51,8 +80,10 @@ template <typename I, typename O, size_t S, size_t C> constexpr std::array<O, C>
    // type checking
    static_assert(std::is_unsigned<I>::value, "Packed input type must be unsigned");
    static_assert(std::is_unsigned<O>::value, "Unpacked output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= (C * S), "Packed input type must be big enough to represent the number of bits multiplied by count");
-    static_assert(std::numeric_limits<O>::digits >= S, "Unpacked output type must be big enough to represent the number of bits");
+    static_assert(std::numeric_limits<I>::digits >= (C * S),
+                  "Packed input type must be big enough to represent the number of bits multiplied by count");
+    static_assert(std::numeric_limits<O>::digits >= S,
+                  "Unpacked output type must be big enough to represent the number of bits");

    constexpr O mask = (1U << S) - 1U;  // maximum value representable by N bits
    std::array<O, C> vals;              // output values array of size C
@ -78,8 +109,10 @@ template <typename I, typename O, size_t S, size_t C> constexpr O Pack(const std
    // type checking
    static_assert(std::is_unsigned<I>::value, "Unpacked input type must be unsigned");
    static_assert(std::is_unsigned<O>::value, "Packed output type must be unsigned");
-    static_assert(std::numeric_limits<I>::digits >= S, "Unpacked input type must be big enough to represent the number of bits");
-    static_assert(std::numeric_limits<O>::digits >= (C * S), "Packed output type must be big enough to represent the number of bits multiplied by count");
+    static_assert(std::numeric_limits<I>::digits >= S,
+                  "Unpacked input type must be big enough to represent the number of bits");
+    static_assert(std::numeric_limits<O>::digits >= (C * S),
+                  "Packed output type must be big enough to represent the number of bits multiplied by count");

    O packed = 0;  // output value of type O

@ -126,26 +159,12 @@ template <typename S> constexpr S scale6To8(S v) {
    return static_cast<S>((v << 2) | (v >> 4));
 }

-template <typename S> constexpr S maximum(S a, S b) { return (a > b) ? a : b; }
-template <typename S> constexpr S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
-template <typename S> constexpr S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
-
-template <typename S> constexpr S minimum(S a, S b) { return (a < b) ? a : b; }
-template <typename S> constexpr S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); }
-template <typename S> constexpr S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); }
-
-template <typename T> constexpr T square(T a) { return a * a; }
-
-constexpr float clampf(float value, float low = 0.0f, float high = 1.0f) {
-    if (value < low)
-        value = low;
-    else if (value > high)
-        value = high;
+/**
+ * Restrict a value to the range [low, high]
+ * @tparam S type of the value and of both bounds
+ * @param value value to clamp
+ * @param low lower bound, must not be greater than high (checked by assert)
+ * @param high upper bound
+ * @return low if value is below low, high if value is above high, otherwise value
+ */
+template <typename S> constexpr S clamp(S value, S low, S high) {
+    assert(low <= high);
+    if (value < low) return low;
+    if (value > high) return high;
     return value;
 }
-constexpr uint8_t clamp255(int32_t i) { return static_cast<uint8_t>((static_cast<unsigned int>(i) & 0xFFFFFF00U) ? (~(i >> 31)) : i); }
-
-template <typename S> constexpr S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); }
 constexpr int32_t clampi(int32_t value, int32_t low, int32_t high) {
    if (value < low)
        value = low;
@ -154,8 +173,11 @@ constexpr int32_t clampi(int32_t value, int32_t low, int32_t high) {
    return value;
 }

-constexpr int squarei(int a) { return a * a; }
-constexpr int absi(int a) { return (a < 0) ? -a : a; }
+/**
+ * Absolute value of an unsigned value, which is the value itself
+ * Takes part in overload resolution only for unsigned types.
+ */
+template <typename T> std::enable_if_t<std::is_unsigned_v<T>, T> abs(const T &sval) { return sval; }
+
+/**
+ * Absolute value of a signed arithmetic value
+ * Takes part in overload resolution only for signed arithmetic types.
+ * @param a value to take the absolute value of
+ * @return -a if a is negative, otherwise a
+ */
+template <typename T> std::enable_if_t<std::is_signed_v<T> && std::is_arithmetic_v<T>, T> abs(const T &a) {
+    return (a < 0) ? -a : a;
+}
+using xsimd::abs;  // provides overload for abs<xsimd::batch>

 template <typename F> constexpr F lerp(F a, F b, F s) { return a + (b - a) * s; }

@ -177,14 +199,15 @@ template <typename... Args> std::string Format(const char *str, const Args &...a
    return output;
 }

-template <class > struct next_size;
+template <class> struct next_size;
 template <class T> using next_size_t = typename next_size<T>::type;
 template <class T> struct Tag { using type = T; };

-template <> struct next_size<int8_t>  : Tag<int16_t> { };
-template <> struct next_size<int16_t> : Tag<int32_t> { };
-template <> struct next_size<int32_t> : Tag<int64_t> { };
+// signed integers: next_size maps each type to the signed type of twice its width
+template <> struct next_size<int8_t> : Tag<int16_t> {};
+template <> struct next_size<int16_t> : Tag<int32_t> {};
+template <> struct next_size<int32_t> : Tag<int64_t> {};

-template <> struct next_size<uint8_t>  : Tag<uint16_t> { };
-template <> struct next_size<uint16_t> : Tag<uint32_t> { };
-template <> struct next_size<uint32_t> : Tag<uint64_t> { };
+// unsigned integers: next_size maps each type to the unsigned type of twice its width
+template <> struct next_size<uint8_t> : Tag<uint16_t> {};
+template <> struct next_size<uint16_t> : Tag<uint32_t> {};
+template <> struct next_size<uint32_t> : Tag<uint64_t> {};
+}  // namespace quicktex