From 534c4f7cfcb9220d0ecf2044c84c7ad68fa9845d Mon Sep 17 00:00:00 2001
From: drewcassidy <drewcassidy@me.com>
Date: Sun, 21 Feb 2021 23:15:04 -0800
Subject: [PATCH] First working (but still bad) BC1 encoder

---
 src/BC1/BC1Block.h     |   4 +-
 src/BC1/BC1Encoder.cpp | 101 +++++++++++++++++++++++++++++++++---
 src/BC1/BC1Encoder.h   |   8 +--
 src/BlockEncoder.h     |   4 ++
 src/BlockView.h        |   5 +-
 src/Color.cpp          |  16 +++---
 src/Color.h            |  10 ++--
 src/Interpolator.cpp   |  40 ++++++++-------
 src/Interpolator.h     |   4 +-
 src/Vector4.h          |   6 +--
 src/Vector4Int.h       | 114 +++++++++++++++++++++++++++++++++++++++++
 src/rgbcx.cpp          |  10 ++--
 12 files changed, 271 insertions(+), 51 deletions(-)
 create mode 100644 src/Vector4Int.h

diff --git a/src/BC1/BC1Block.h b/src/BC1/BC1Block.h
index 77b81c4..6e14739 100644
--- a/src/BC1/BC1Block.h
+++ b/src/BC1/BC1Block.h
@@ -64,8 +64,8 @@ class BC1Block {
         return unpacked;
     }
 
-    void PackSelectors(const UnpackedSelectors& unpacked) {
-        for (unsigned i = 0; i < 4; i++) { selectors[i] = Pack<uint8_t, uint8_t, 2, 4>(unpacked[i]); }
+    void PackSelectors(const UnpackedSelectors& unpacked, uint8_t mask = 0) {
+        for (unsigned i = 0; i < 4; i++) { selectors[i] = mask ^ Pack<uint8_t, uint8_t, 2, 4>(unpacked[i]); }
     }
 
     constexpr static inline size_t EndpointSize = 2;
diff --git a/src/BC1/BC1Encoder.cpp b/src/BC1/BC1Encoder.cpp
index fa98956..ff321b0 100644
--- a/src/BC1/BC1Encoder.cpp
+++ b/src/BC1/BC1Encoder.cpp
@@ -29,6 +29,7 @@
 #include "../Color.h"
 #include "../Matrix4x4.h"
 #include "../Vector4.h"
+#include "../Vector4Int.h"
 #include "../bitwiseEnums.h"
 
 namespace rgbcx {
@@ -96,6 +97,7 @@ template <size_t S> void PrepSingleColorTable(MatchList &match_table, MatchList
 BC1Encoder::BC1Encoder(InterpolatorPtr interpolator) : _interpolator(interpolator) {
     PrepSingleColorTable<5>(*_single_match5, *_single_match5_half, *_interpolator);
     PrepSingleColorTable<6>(*_single_match6, *_single_match6_half, *_interpolator);
+    _flags = Flags::None;
 }
 
 void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
@@ -105,13 +107,32 @@ void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
 
     Color first = pixels.Get(0, 0);
 
-    if (pixels.IsSingleColor()) {  // for now assume (wrongly) everything is a single-color block
+    if (pixels.IsSingleColor()) {
         // single-color pixel block, do it the fast way
         EncodeBlockSingleColor(first, dest);
         return;
     }
 
     auto metrics = pixels.GetMetrics();
+    // group the masks before testing: `&` binds tighter than `|`, so without the inner parentheses only the first mask is ANDed with _flags
+    bool needs_block_error = (_flags & (Flags::UseLikelyTotalOrderings | Flags::Use3ColorBlocks | Flags::UseFullMSEEval)) != Flags::None;
+    needs_block_error |= (_search_rounds > 0);
+    needs_block_error |= metrics.has_black && ((_flags & Flags::Use3ColorBlocksForBlackPixels) != Flags::None);
+
+    unsigned cur_err = UINT_MAX;
+
+    if (!needs_block_error || true) {
+        //        assert((_flags & Flags::TryAllInitialEndponts) == Flags::None);
+
+        EncodeResults orig;
+        FindEndpoints(pixels, _flags, metrics, orig.low, orig.high);
+        FindSelectors4(pixels, orig);
+        if (orig.low == orig.high) {
+            EncodeBlockSingleColor(metrics.avg, dest);
+        } else {
+            EncodeBlock4Color(orig, dest);
+        }
+    }
 }
 
 void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
@@ -178,7 +199,38 @@ void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
     dest->selectors[3] = mask;
 }
 
+// Writes `block` to dest as a 4-color BC1 block; identical endpoints fall back to the single-color path.
+void BC1Encoder::EncodeBlock4Color(EncodeResults &block, BC1Block *dest) const {
+    if (block.low == block.high) {
+        EncodeBlockSingleColor(block.low.ScaleFrom565(), dest);
+        return;
+    }
+    uint16_t color0 = block.low.Pack565Unscaled();
+    uint16_t color1 = block.high.Pack565Unscaled();
+    // the first stored endpoint must be the larger one here; if they arrive reversed, exchange them and
+    // XOR each packed selector byte with 0x55 (low bit of every 2-bit field: 0 <-> 1, 2 <-> 3)
+    const bool reversed = color0 < color1;
+    if (reversed) { std::swap(color0, color1); }
+    const uint8_t mask = reversed ? 0x55 : 0;
+
+    assert(color0 > color1);
+    dest->SetLowColor(color0);
+    dest->SetHighColor(color1);
+    dest->PackSelectors(block.selectors, mask);
+}
+
+void encode_bc1_pick_initial(const Color *pSrc_pixels, uint32_t flags, bool grayscale_flag, int min_r, int min_g, int min_b, int max_r, int max_g, int max_b,
+                             int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, int &lr, int &lg, int &lb, int &hr, int &hg, int &hb);
+
 void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const BC1Encoder::BlockMetrics metrics, Color &low, Color &high) const {
+    int lr, lg, lb, hr, hg, hb;
+    auto colors = pixels.Flatten();
+    encode_bc1_pick_initial(&colors[0], 0, metrics.is_greyscale, metrics.min.r, metrics.min.g, metrics.min.b, metrics.max.r, metrics.max.g, metrics.max.b,
+                            metrics.avg.r, metrics.avg.g, metrics.avg.b, metrics.sums[0], metrics.sums[1], metrics.sums[2], lr, lg, lb, hr, hg, hb);
+    low = Color(lr, lg, lb);
+    high = Color(hr, hg, hb);
+//    return;
+
     if (metrics.is_greyscale) {
         // specialized greyscale case
         const unsigned fr = pixels.Get(0).r;
@@ -209,7 +261,7 @@ void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const B
         l[chan0] = (float)min[chan0];
         h[chan0] = (float)min[chan0];
 
-        assert(diff[chan0] >= diff[(chan0 + 1) % 3] && diff[chan0] >= diff[(chan0 + 2) % 3]);
+        assert((diff[chan0] >= diff[(chan0 + 1) % 3]) && (diff[chan0] >= diff[(chan0 + 2) % 3]));
 
         std::array<unsigned, 3> sums_xy;
 
@@ -317,7 +369,7 @@ void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const B
         std::array<Vector4, 16> colors;
 
         Vector4 axis = {306, 601, 117};  // Luma vector
-        Matrix4x4 covariance;
+        Matrix4x4 covariance = Matrix4x4::Identity();
         const unsigned total_power_iters = (flags & Flags::Use6PowerIters) != Flags::None ? 6 : 4;
 
         for (unsigned i = 0; i < 16; i++) {
@@ -346,12 +398,14 @@ void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const B
 
         // if we found any correlation, then this is our new axis. otherwise we fallback to the luma vector
         float k = delta.MaxAbs(3);
-        if (k > 2) { axis = delta * (2048.0f / k); }
+        if (k >= 2) { axis = delta * (2048.0f / k); }
+
+        axis *= 16;
 
         float min_dot = INFINITY;
         float max_dot = -INFINITY;
 
-        unsigned min_index, max_index;
+        unsigned min_index = 0, max_index = 0;
 
         for (unsigned i = 0; i < 16; i++) {
             // since axis is constant here, I dont think its magnitude actually matters,
@@ -360,7 +414,8 @@ void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const B
             if (dot > max_dot) {
                 max_dot = dot;
                 max_index = i;
-            } else if (dot < min_dot) {
+            }
+            if (dot < min_dot) {
                 min_dot = dot;
                 min_index = i;
             }
@@ -370,4 +425,38 @@ void BC1Encoder::FindEndpoints(Color4x4 pixels, BC1Encoder::Flags flags, const B
         high = pixels.Get(max_index).ScaleTo565();
     }
 }
+
+/// Assigns a 4-color-mode selector to every pixel of the block.
+/// Only the error-free path (use_err == false) exists so far: each pixel is projected onto the low->high axis
+/// and bucketed by the midpoints between neighbouring palette entries. cur_err is not read yet; always returns 0.
+unsigned BC1Encoder::FindSelectors4(Color4x4 pixels, BC1Encoder::EncodeResults &block, unsigned int cur_err, bool use_err) const {
+    // TODO: the error-tracking search is not implemented; block.selectors is left untouched on that path
+    if (use_err) { return 0; }
+
+    // palette in selector order: 0 = low color, 1 = high color, 2/3 = interpolated
+    const std::array<Color, 4> colors = _interpolator->InterpolateBC1(block.low, block.high, false);
+
+    // the same palette ordered along the axis from low to high, and the selector each threshold count maps to
+    std::array<Vector4Int, 4> colorVectors = {(Vector4Int)colors[0], (Vector4Int)colors[2], (Vector4Int)colors[3], (Vector4Int)colors[1]};
+    const std::array<uint8_t, 4> selectors = {1, 3, 2, 0};
+
+    Vector4Int a = colorVectors[3] - colorVectors[0];
+    std::array<int, 4> dots;
+    for (unsigned i = 0; i < 4; i++) { dots[i] = a.Dot(colorVectors[i]); }
+
+    // t0..t2 are twice the projected midpoints between adjacent palette entries, hence the doubled axis below
+    const int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];
+    a *= 2;
+
+    for (unsigned x = 0; x < 4; x++) {
+        for (unsigned y = 0; y < 4; y++) {
+            const int dot = a.Dot((Vector4Int)pixels.Get(x, y));
+            // number of thresholds the pixel lies below: 0 = nearest the high end, 3 = nearest the low end
+            const unsigned level = (dot <= t0) + (dot < t1) + (dot < t2);
+            assert(level < 4);  // checked before it is used as an index
+            block.selectors[y][x] = selectors[level];
+        }
+    }
+    return 0;
+}
 }  // namespace rgbcx
\ No newline at end of file
diff --git a/src/BC1/BC1Encoder.h b/src/BC1/BC1Encoder.h
index 9266002..73c9a17 100644
--- a/src/BC1/BC1Encoder.h
+++ b/src/BC1/BC1Encoder.h
@@ -109,8 +109,6 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
     void EncodeBlock(Color4x4 pixels, BC1Block *dest) const override;
 
    private:
-    using Vec3F = std::array<float, 3>;
-
     const InterpolatorPtr _interpolator;
 
     Flags _flags;
@@ -119,15 +117,19 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
     unsigned _orderings3;
 
     // Unpacked BC1 block with metadata
+    using UnpackedSelectors = std::array<std::array<uint8_t, 4>, 4>;
     struct EncodeResults {
         Color low;
         Color high;
-        std::array<uint8_t, 16> selectors;
+        UnpackedSelectors selectors;
         bool is_3_color;
     };
 
     void EncodeBlockSingleColor(Color color, BC1Block *dest) const;
+    void EncodeBlock4Color(EncodeResults &block, BC1Block *dest) const;
+
     void FindEndpoints(Color4x4 pixels, Flags flags, BlockMetrics const metrics, Color &low, Color &high) const;
+    unsigned FindSelectors4(Color4x4 pixels, EncodeResults &block, unsigned cur_err = 0, bool use_err = false) const;
 
     // match tables used for single-color blocks
     // Each entry includes a high and low pair that best reproduces the 8-bit index as well as possible,
diff --git a/src/BlockEncoder.h b/src/BlockEncoder.h
index b153d7c..91885ae 100644
--- a/src/BlockEncoder.h
+++ b/src/BlockEncoder.h
@@ -60,6 +60,10 @@ template <class B, size_t M, size_t N> class BlockEncoder {
                 unsigned top_left = pixel_x + (pixel_y * image_width);
                 auto src = DecodedBlock(&decoded[top_left], image_width);
 
+                if (pixel_x == 272 && pixel_y == 748) {
+                    int foo = 3;
+                }
+
                 EncodeBlock(src, &blocks[x + block_width * y]);
             }
         }
diff --git a/src/BlockView.h b/src/BlockView.h
index c495354..57d6ac1 100644
--- a/src/BlockView.h
+++ b/src/BlockView.h
@@ -135,8 +135,7 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
         metrics.max = Color(0, 0, 0);
         metrics.has_black = false;
         metrics.is_greyscale = true;
-
-        std::array<unsigned, 3> sums;
+        metrics.sums = {0, 0, 0};
 
         for (unsigned i = 0; i < M * N; i++) {
             auto val = Base::Get(i);
@@ -146,7 +145,7 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
                 } else {
                     metrics.max[c] = val[c];
                 }
-                sums[c] += val[c];
+                metrics.sums[c] += val[c];
             }
             metrics.is_greyscale &= ((val.r == val.g) && (val.r == val.b));
             metrics.has_black |= (val.r | val.g | val.b < black_threshold);
diff --git a/src/Color.cpp b/src/Color.cpp
index 88a53c4..498148f 100644
--- a/src/Color.cpp
+++ b/src/Color.cpp
@@ -21,6 +21,7 @@
 #include <algorithm>  // for max, Min
 
 #include "Vector4.h"
+#include "Vector4Int.h"
 #include "util.h"  // for scale5To8, scale8To5, assert5bit, scale6To8
 
 namespace rgbcx {
@@ -90,14 +91,14 @@ void Color::SetRGB(uint8_t vr, uint8_t vg, uint8_t vb) {
 }
 
 size_t Color::MinChannelRGB() {
-    if (r < g && r < b) return 0;
-    if (g < b && g < r) return 1;
+    if (r <= g && r <= b) return 0;
+    if (g <= b && g <= r) return 1;
     return 2;
 }
 
 size_t Color::MaxChannelRGB() {
-    if (r > g && r > b) return 0;
-    if (g > b && g > r) return 1;
+    if (r >= g && r >= b) return 0;
+    if (g >= b && g >=r) return 1;
     return 2;
 }
 
@@ -105,8 +106,11 @@ Color Color::Min(const Color &A, const Color &B) { return Color(std::min(A[0], B
 
 Color Color::Max(const Color &a, const Color &b) { return Color(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]), std::max(a[3], b[3])); }
 
-uint16_t Color::pack565() { return Pack565(r, g, b); }
-uint16_t Color::pack565Unscaled() { return Pack565Unscaled(r, g, b); }
+Color::operator Vector4() const { return Vector4(r, g, b, a); }
+Color::operator Vector4Int() const { return Vector4Int(r, g, b, a);}
+
+uint16_t Color::Pack565() const { return Pack565(r, g, b); }
+uint16_t Color::Pack565Unscaled() const { return Pack565Unscaled(r, g, b); }
 
 Color Color::ScaleTo565() const { return Color(scale8To5(r), scale8To6(g), scale8To5(b)); }
 Color Color::ScaleFrom565() const { return Color(scale5To8(r), scale6To8(g), scale5To8(b)); }
diff --git a/src/Color.h b/src/Color.h
index b5db7bd..3976390 100644
--- a/src/Color.h
+++ b/src/Color.h
@@ -25,9 +25,10 @@
 
 namespace rgbcx {
 class Vector4;
+class Vector4Int;
+
 
 #pragma pack(push, 1)
-
 class Color {
    public:
     uint8_t r;
@@ -61,14 +62,17 @@ class Color {
         return reinterpret_cast<uint8_t *>(this)[index];
     }
 
+    operator Vector4() const;
+    operator Vector4Int() const;
+
     void SetRGBA(uint8_t vr, uint8_t vg, uint8_t vb, uint8_t va);
     void SetRGBA(const Color &other) { SetRGBA(other.r, other.g, other.b, other.a); }
 
     void SetRGB(uint8_t vr, uint8_t vg, uint8_t vb);
     void SetRGB(const Color &other) { SetRGB(other.r, other.g, other.b); }
 
-    uint16_t pack565();
-    uint16_t pack565Unscaled();
+    uint16_t Pack565() const;
+    uint16_t Pack565Unscaled() const;
 
     Color ScaleTo565() const;
     Color ScaleFrom565() const;
diff --git a/src/Interpolator.cpp b/src/Interpolator.cpp
index 9262502..d2de1d3 100644
--- a/src/Interpolator.cpp
+++ b/src/Interpolator.cpp
@@ -110,18 +110,22 @@ uint8_t Interpolator::InterpolateHalf5(uint8_t v0, uint8_t v1) const { return In
 uint8_t Interpolator::InterpolateHalf6(uint8_t v0, uint8_t v1) const { return InterpolateHalf8(scale6To8(v0), scale6To8(v1)); }
 
 std::array<Color, 4> Interpolator::InterpolateBC1(uint16_t low, uint16_t high) const {
-    auto colors = std::array<Color, 4>();
-    colors[0] = Color::Unpack565(low);
-    colors[1] = Color::Unpack565(high);
+    return InterpolateBC1(Color::Unpack565Unscaled(low), Color::Unpack565Unscaled(high), (high >= low));
+}
 
-    if (low > high) {
-        // 4-color mode
-        colors[2] = InterpolateColor24(colors[0], colors[1]);
-        colors[3] = InterpolateColor24(colors[1], colors[0]);
-    } else {
+std::array<Color, 4> Interpolator::InterpolateBC1(Color low, Color high, bool use_3color) const {
+    auto colors = std::array<Color, 4>();
+    colors[0] = low.ScaleFrom565();
+    colors[1] = high.ScaleFrom565();
+
+    if (use_3color) {
         // 3-color mode
         colors[2] = InterpolateHalfColor24(colors[0], colors[1]);
         colors[3] = Color(0, 0, 0, 0);  // transparent black
+    } else {
+        // 4-color mode
+        colors[2] = InterpolateColor24(colors[0], colors[1]);
+        colors[3] = InterpolateColor24(colors[1], colors[0]);
     }
 
     return colors;
@@ -147,7 +151,7 @@ uint8_t InterpolatorNvidia::Interpolate5(uint8_t v0, uint8_t v1) const {
 
 uint8_t InterpolatorNvidia::Interpolate6(uint8_t v0, uint8_t v1) const {
     assert(v0 < 64 && v1 < 64);
-    const int gdiff = (int) v1 - v0;
+    const int gdiff = (int)v1 - v0;
     return static_cast<uint8_t>((256 * v0 + (gdiff / 4) + 128 + gdiff * 80) >> 8);
 }
 
@@ -158,25 +162,23 @@ uint8_t InterpolatorNvidia::InterpolateHalf5(uint8_t v0, uint8_t v1) const {
 
 uint8_t InterpolatorNvidia::InterpolateHalf6(uint8_t v0, uint8_t v1) const {
     assert(v0 < 64 && v1 < 64);
-    const int gdiff = (int) v1 - v0;
+    const int gdiff = (int)v1 - v0;
     return static_cast<uint8_t>((256 * v0 + gdiff / 4 + 128 + gdiff * 128) >> 8);
 }
 
-std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(uint16_t low, uint16_t high) const {
+std::array<Color, 4> InterpolatorNvidia::InterpolateBC1(Color low, Color high, bool use_3color) const {
     // Nvidia is special and interpolation cant be done with 8-bit values, so we need to override the default behavior
     std::array<Color, 4> colors;
-    auto low565 = Color::Unpack565Unscaled(low);
-    auto high565 = Color::Unpack565Unscaled(high);
-    colors[0] = low565.ScaleFrom565();
-    colors[1] = high565.ScaleFrom565();
+    colors[0] = low.ScaleFrom565();
+    colors[1] = high.ScaleFrom565();
 
-    if (low > high) {
+    if (!use_3color) {
         // 4-color mode
-        colors[2] = InterpolateColor565(low565, high565);
-        colors[3] = InterpolateColor565(high565, low565);
+        colors[2] = InterpolateColor565(low, high);
+        colors[3] = InterpolateColor565(high, low);
     } else {
         // 3-color mode
-        colors[2] = InterpolateHalfColor565(low565, high565);
+        colors[2] = InterpolateHalfColor565(low, high);
         colors[3] = Color(0, 0, 0, 0);  // transparent black
     }
 
diff --git a/src/Interpolator.h b/src/Interpolator.h
index 4dc5d45..0c0b66b 100644
--- a/src/Interpolator.h
+++ b/src/Interpolator.h
@@ -98,6 +98,8 @@ class Interpolator {
      */
     virtual std::array<Color, 4> InterpolateBC1(uint16_t low, uint16_t high) const;
 
+    virtual std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const;
+
     /**
      * Gets the type of an interpolator
      * @return The interpolator type
@@ -142,7 +144,7 @@ class InterpolatorNvidia : public Interpolator {
     uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
     uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;
 
-    std::array<Color, 4> InterpolateBC1(uint16_t low, uint16_t high) const override;
+    std::array<Color, 4> InterpolateBC1(Color low, Color high, bool use_3color) const override;
 
     Type GetType() const noexcept override { return Type::Nvidia; }
     bool CanInterpolate8Bit() const noexcept override { return false; }
diff --git a/src/Vector4.h b/src/Vector4.h
index 7b73fb9..e340625 100644
--- a/src/Vector4.h
+++ b/src/Vector4.h
@@ -48,7 +48,7 @@ class Vector4 {
 
     static Vector4 FromColorRGB(const Color &c) { return Vector4(c.r, c.g, c.b); }
 
-    static float Dot(Vector4 &lhs, Vector4 &rhs) {
+    static float Dot(const Vector4 &lhs, const Vector4 &rhs) {
         float sum = 0;
         for (unsigned i = 0; i < 4; i++) { sum += lhs[i] * rhs[i]; }
         return sum;
@@ -83,8 +83,8 @@ class Vector4 {
     friend Vector4 &operator*=(Vector4 &lhs, const float &rhs) { return lhs = lhs * rhs; }
     friend Vector4 &operator/=(Vector4 &lhs, const float &rhs) { return lhs = lhs / rhs; }
 
-    float Dot(Vector4 other) { return Dot(*this, other); }
-    float MaxAbs(unsigned channels = 4) {
+    float Dot(Vector4 other) const { return Dot(*this, other); }
+    float MaxAbs(unsigned channels = 4) const {
         assert(channels < 5);
         assert(channels > 0);
         float max = 0;
diff --git a/src/Vector4Int.h b/src/Vector4Int.h
new file mode 100644
index 0000000..f636594
--- /dev/null
+++ b/src/Vector4Int.h
@@ -0,0 +1,114 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <cstdlib>
+#include <functional>
+
+#include "Color.h"
+
+namespace rgbcx {
+
+/// Four-component integer vector with element-wise arithmetic operators.
+class Vector4Int {
+   public:
+    Vector4Int() : _c{{0, 0, 0, 0}} {}  // zero vector
+
+    /// Component-wise construction; z and w default to 0. Two or more components are required so that a
+    /// single int argument can only ever select the scalar constructor below (the two used to be ambiguous).
+    Vector4Int(int x, int y, int z = 0, int w = 0) : _c{{x, y, z, w}} {}
+
+    /// Broadcasts one value to all four components.
+    explicit Vector4Int(int scalar) : _c{{scalar, scalar, scalar, scalar}} {}
+
+    /// Copies r, g, b, a into x, y, z, w.
+    Vector4Int(const Color &c) : Vector4Int(c.r, c.g, c.b, c.a) {}
+    static Vector4Int FromColor(const Color &c) { return Vector4Int(c); }
+
+    /// Takes r, g, b only; w is left at 0.
+    static Vector4Int FromColorRGB(const Color &c) { return Vector4Int(c.r, c.g, c.b); }
+
+    /// Dot product over all four components.
+    static int Dot(const Vector4Int &lhs, const Vector4Int &rhs) {
+        int sum = 0;
+        for (unsigned i = 0; i < 4; i++) { sum += lhs[i] * rhs[i]; }
+        return sum;
+    }
+
+    int operator[](size_t index) const {
+        assert(index < 4);
+        return _c[index];
+    }
+    int &operator[](size_t index) {
+        assert(index < 4);
+        return _c[index];
+    }
+
+    friend Vector4Int operator+(const Vector4Int &lhs, const Vector4Int &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Vector4Int operator-(const Vector4Int &lhs, const Vector4Int &rhs) { return DoOp(lhs, rhs, std::minus()); }
+    friend Vector4Int operator*(const Vector4Int &lhs, const Vector4Int &rhs) { return DoOp(lhs, rhs, std::multiplies()); }
+    friend Vector4Int operator/(const Vector4Int &lhs, const Vector4Int &rhs) { return DoOp(lhs, rhs, std::divides()); }
+
+    friend Vector4Int operator+(const Vector4Int &lhs, const int &rhs) { return DoOp(lhs, rhs, std::plus()); }
+    friend Vector4Int operator-(const Vector4Int &lhs, const int &rhs) { return DoOp(lhs, rhs, std::minus()); }
+    friend Vector4Int operator*(const Vector4Int &lhs, const int &rhs) { return DoOp(lhs, rhs, std::multiplies()); }
+    friend Vector4Int operator/(const Vector4Int &lhs, const int &rhs) { return DoOp(lhs, rhs, std::divides()); }
+
+    friend Vector4Int &operator+=(Vector4Int &lhs, const Vector4Int &rhs) { return lhs = lhs + rhs; }
+    friend Vector4Int &operator-=(Vector4Int &lhs, const Vector4Int &rhs) { return lhs = lhs - rhs; }
+    friend Vector4Int &operator*=(Vector4Int &lhs, const Vector4Int &rhs) { return lhs = lhs * rhs; }
+    friend Vector4Int &operator/=(Vector4Int &lhs, const Vector4Int &rhs) { return lhs = lhs / rhs; }
+
+    friend Vector4Int &operator+=(Vector4Int &lhs, const int &rhs) { return lhs = lhs + rhs; }
+    friend Vector4Int &operator-=(Vector4Int &lhs, const int &rhs) { return lhs = lhs - rhs; }
+    friend Vector4Int &operator*=(Vector4Int &lhs, const int &rhs) { return lhs = lhs * rhs; }
+    friend Vector4Int &operator/=(Vector4Int &lhs, const int &rhs) { return lhs = lhs / rhs; }
+
+    int Dot(Vector4Int other) const { return Dot(*this, other); }
+    /// Largest absolute value among the first `channels` components; channels must be 1 to 4.
+    int MaxAbs(unsigned channels = 4) const {
+        assert(channels > 0 && channels < 5);
+        int max = 0;
+        for (unsigned i = 0; i < channels; i++) {
+            const int a = std::abs(_c[i]);
+            if (a > max) max = a;
+        }
+        return max;
+    }
+
+   private:
+    template <typename Op> friend Vector4Int DoOp(const Vector4Int &lhs, const Vector4Int &rhs, Op f) {
+        Vector4Int r;
+        for (unsigned i = 0; i < 4; i++) { r[i] = f(lhs[i], rhs[i]); }
+        return r;
+    }
+
+    template <typename Op> friend Vector4Int DoOp(const Vector4Int &lhs, const int &rhs, Op f) {
+        Vector4Int r;
+        for (unsigned i = 0; i < 4; i++) { r[i] = f(lhs[i], rhs); }
+        return r;
+    }
+
+    std::array<int, 4> _c;
+};
+
+}  // namespace rgbcx
diff --git a/src/rgbcx.cpp b/src/rgbcx.cpp
index 987d0f7..5af93f9 100644
--- a/src/rgbcx.cpp
+++ b/src/rgbcx.cpp
@@ -666,8 +666,8 @@ static inline void bc1_get_block_colors3(uint32_t block_r[3], uint32_t block_g[3
     }
 }
 
-static inline void bc1_find_sels4_noerr(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
-                                        uint8_t sels[16]) {
+
+void bc1_find_sels4_noerr(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t *sels) {
     uint32_t block_r[4], block_g[4], block_b[4];
     bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);
 
@@ -1454,9 +1454,9 @@ void encode_bc1(uint32_t level, void *pDst, const uint8_t *pPixels, bool allow_3
 }
 
 // Finds low and high colors to begin with
-static inline void encode_bc1_pick_initial(const Color *pSrc_pixels, uint32_t flags, bool grayscale_flag, int min_r, int min_g, int min_b, int max_r,
-                                           int max_g, int max_b, int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, int &lr, int &lg,
-                                           int &lb, int &hr, int &hg, int &hb) {
+
+void encode_bc1_pick_initial(const Color *pSrc_pixels, uint32_t flags, bool grayscale_flag, int min_r, int min_g, int min_b, int max_r, int max_g, int max_b,
+                             int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, int &lr, int &lg, int &lb, int &hr, int &hg, int &hb) {
     if (grayscale_flag) {
         const int fr = pSrc_pixels[0].r;