Add ordertable and move single color tables to their own template class

2024-09-13 06:37:34 +00:00 · 2021-02-26 22:16:12 -08:00 · 2021-02-26 22:16:12 -08:00 · 5b492fd4b5
commit 5b492fd4b5
parent 68896aca1a
10 changed files with 427 additions and 124 deletions
--- a/src/BC1/BC1Encoder.cpp
+++ b/src/BC1/BC1Encoder.cpp
@ -36,73 +36,39 @@
 #include "../Vector4Int.h"
 #include "../bitwiseEnums.h"
 #include "../util.h"
+#include "OrderTable.h"
+#include "SingleColorTable.h"

 namespace rgbcx {
-using MatchList = std::array<BC1MatchEntry, 256>;
-using MatchListPtr = std::shared_ptr<MatchList>;
 using InterpolatorPtr = std::shared_ptr<Interpolator>;
+using Hist3 = OrderTable<3>::Histogram;
+using Hist4 = OrderTable<4>::Histogram;

 // region Free Functions/Templates
-inline void PrepSingleColorTableEntry(unsigned &error, MatchList &match_table, uint8_t v, unsigned i, uint8_t low, uint8_t high, uint8_t low8, uint8_t high8,
-                                      bool ideal) {
-    unsigned new_error = iabs(v - (int)i);
-
-    // We only need to factor in 3% error in BC1 ideal mode.
-    if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100;
-
-    // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation.
-    if ((new_error < error) || (new_error == error && low == high)) {
-        assert(new_error <= UINT8_MAX);
-
-        match_table[i].low = (uint8_t)low;
-        match_table[i].high = (uint8_t)high;
-        match_table[i].error = (uint8_t)new_error;
-
-        error = new_error;
-    }
-}
-
-template <size_t S> void PrepSingleColorTable(MatchList &match_table, MatchList &match_table_half, Interpolator &interpolator) {
-    unsigned size = 1 << S;
-
-    assert((S == 5 && size == 32) || (S == 6 && size == 64));
-
-    bool ideal = interpolator.IsIdeal();
-    bool use_8bit = interpolator.CanInterpolate8Bit();
-
-    for (unsigned i = 0; i < 256; i++) {
-        unsigned error = 256;
-        unsigned error_half = 256;
-
-        // TODO: Can probably avoid testing for values that definitely wont yield good results,
-        // e.g. low8 and high8 both much smaller or larger than index
-        for (uint8_t low = 0; low < size; low++) {
-            uint8_t low8 = (S == 5) ? scale5To8(low) : scale6To8(low);
-
-            for (uint8_t high = 0; high < size; high++) {
-                uint8_t high8 = (S == 5) ? scale5To8(high) : scale6To8(high);
-                uint8_t value, value_half;
-
-                if (use_8bit) {
-                    value = interpolator.Interpolate8(high8, low8);
-                    value_half = interpolator.InterpolateHalf8(high8, low8);
-                } else {
-                    value = (S == 5) ? interpolator.Interpolate5(high, low) : interpolator.Interpolate6(high, low);
-                    value_half = (S == 5) ? interpolator.InterpolateHalf5(high, low) : interpolator.InterpolateHalf6(high, low);
-                }
-
-                PrepSingleColorTableEntry(error, match_table, value, i, low, high, low8, high8, ideal);
-                PrepSingleColorTableEntry(error_half, match_table_half, value_half, i, low, high, low8, high8, ideal);
-            }
-        }
-    }
-}
 // endregion

+// Static fields shared by every BC1Encoder instance.
+// The order tables start out null and are allocated by the first BC1Encoder constructor call,
+// which takes order_table_mutex while it checks and sets order_tables_generated.
+OrderTable<3> *BC1Encoder::order_table3 = nullptr;
+OrderTable<4> *BC1Encoder::order_table4 = nullptr;
+std::mutex BC1Encoder::order_table_mutex;  // default-constructed: std::mutex is neither copyable nor movable
+bool BC1Encoder::order_tables_generated = false;
+
 BC1Encoder::BC1Encoder(InterpolatorPtr interpolator) : _interpolator(interpolator) {
-    PrepSingleColorTable<5>(*_single_match5, *_single_match5_half, *_interpolator);
-    PrepSingleColorTable<6>(*_single_match6, *_single_match6_half, *_interpolator);
-    _flags = Flags::UseFullMSEEval | Flags::TwoLeastSquaresPasses;
+    _flags = Flags::UseFasterMSEEval | Flags::TwoLeastSquaresPasses;
+
+    // Generate the shared order tables once; the lock_guard releases the mutex even if an allocation throws.
+    const std::lock_guard<std::mutex> lock(order_table_mutex);
+    if (!order_tables_generated) {
+        assert(order_table3 == nullptr);
+        assert(order_table4 == nullptr);
+
+        order_table3 = new OrderTable<3>();
+        order_table4 = new OrderTable<4>();
+        order_tables_generated = true;
+    }
+    assert(order_table3 != nullptr);
+    assert(order_table4 != nullptr);
+    // NOTE(review): _search_rounds, _orderings3 and _orderings4 are not initialized here - confirm they are set before use.
 }

 void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
@ -125,10 +91,11 @@ void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
    needs_block_error |= metrics.has_black && ((_flags & Flags::Use3ColorBlocksForBlackPixels) != Flags::None);

    unsigned total_ls_passes = (_flags & Flags::TwoLeastSquaresPasses) != Flags::None ? 2 : 1;
-    unsigned total_rounds = needs_block_error && ((_flags & Flags::TryAllInitialEndpoints) != Flags::None) ? 2 : 1;
+    unsigned total_ep_rounds = needs_block_error && ((_flags & Flags::TryAllInitialEndpoints) != Flags::None) ? 2 : 1;

+    // Initial block generation
    EncodeResults result;
-    for (unsigned round = 0; round < total_rounds; round++) {
+    for (unsigned round = 0; round < total_ep_rounds; round++) {
        Flags modified_flags = _flags;
        if (round == 1) {
            modified_flags &= ~(Flags::Use2DLS | Flags::BoundingBoxInt);
@ -158,6 +125,54 @@ void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
        if (!needs_block_error || round_result.error < result.error) { result = round_result; }
    }

+    // First refinement pass using ordered cluster fit
+    if (result.error > 0 && (_flags & Flags::UseLikelyTotalOrderings) != Flags::None) {
+        const unsigned total_iters = (_flags & Flags::Iterative) != Flags::None ? 2 : 1;
+        for (unsigned iter = 0; iter < total_iters; iter++) {
+            EncodeResults orig = result;
+            Hist4 h(orig.selectors);
+
+            const Hash order_index = order_table4->GetHash(h);
+
+            Color low = orig.low.ScaleFrom565();
+            Color high = orig.high.ScaleFrom565();
+
+            Vector4Int axis = high - low;
+            std::array<Vector4, 16> color_vectors;
+
+            std::array<uint32_t, 16> dots;
+            for (unsigned i = 0; i < 16; i++) {
+                color_vectors[i] = Vector4::FromColorRGB(pixels.Get(i));
+                int dot = 0x1000000 + color_vectors[i].Dot(axis);
+                assert(dot >= 0);
+                dots[i] = (uint32_t)(dot << 4) | i;
+            }
+
+            std::sort(dots.begin(), dots.end());
+
+            // we now have a list of indices and their dot products along the primary axis
+            std::array<Vector4, 17> sums;
+            for (unsigned i = 0; i < 16; i++) {
+                const unsigned p = dots[i] & 0xF;
+                sums[i + 1] = sums[i] + color_vectors[p];
+            }
+
+            const unsigned q_total = ((_flags & Flags::Exhaustive) != Flags::None) ? order_table4->UniqueOrderings
+                                                                                   : (unsigned)clampi(_orderings4, MIN_TOTAL_ORDERINGS, MAX_TOTAL_ORDERINGS4);
+            for (unsigned q = 0; q < q_total; q++) {
+                Hash s = ((_flags & Flags::Exhaustive) != Flags::None) ? q : g_best_total_orderings4[order_index][q];
+
+                EncodeResults trial = orig;
+                Vector4 low, high;
+                if (order_table4->IsSingleColor(order_index)) {
+                    trial.is_1_color = true;
+                    trial.is_3_color = false;
+                } else {
+                }
+            }
+        }
+    }
+
    if (result.low == result.high) {
        EncodeBlockSingleColor(metrics.avg, dest);
    } else {
@ -172,19 +187,15 @@ void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
    bool using_3color = false;

    // why is there no subscript operator for shared_ptr<array>
-    MatchList &match5 = *_single_match5;
-    MatchList &match6 = *_single_match6;
-    MatchList &match5_half = *_single_match5_half;
-    MatchList &match6_half = *_single_match6_half;

-    BC1MatchEntry match_r = match5[color.r];
-    BC1MatchEntry match_g = match6[color.g];
-    BC1MatchEntry match_b = match5[color.b];
+    auto match_r = _single_match5[color.r];
+    auto match_g = _single_match6[color.g];
+    auto match_b = _single_match5[color.b];

    if ((_flags & (Flags::Use3ColorBlocks | Flags::Use3ColorBlocksForBlackPixels)) != Flags::None) {
-        BC1MatchEntry match_r_half = match5_half[color.r];
-        BC1MatchEntry match_g_half = match6_half[color.g];
-        BC1MatchEntry match_b_half = match5_half[color.b];
+        auto match_r_half = _single_match5_half[color.r];
+        auto match_g_half = _single_match6_half[color.g];
+        auto match_b_half = _single_match5_half[color.b];

        const unsigned err4 = match_r.error + match_g.error + match_b.error;
        const unsigned err3 = match_r_half.error + match_g_half.error + match_b_half.error;
@ -590,4 +601,17 @@ bool BC1Encoder::ComputeEndpointsLS(Color4x4 pixels, EncodeResults &block, Block
    block.high = Color::PreciseRound565(high);
    return true;
 }
+/*
+bool BC1Encoder::ComputeEndpointsLS(Color4x4 pixels, EncodeResults &block, BlockMetrics metrics, Hash hash, Vector4 &matrix, std::array<Vector4, 17> &sums,
+                                    bool is_3color, bool use_black) const {
+    unsigned f1, f2, f3;
+    int denominator = is_3color ? 2 : 3;
+
+    if (is_3color) {
+        order_table3->GetUniqueOrderingSums(hash, f1, f2, f3);
+    } else {
+        order_table4->GetUniqueOrderingSums(hash, f1, f2, f3);
+    }
+}*/
+
 }  // namespace rgbcx
--- a/src/BC1/BC1Encoder.h
+++ b/src/BC1/BC1Encoder.h
@ -30,20 +30,15 @@
 #include "../bitwiseEnums.h"
 #include "../ndebug.h"
 #include "BC1Block.h"
+#include "OrderTable.h"
+#include "SingleColorTable.h"
 #include "tables.h"

 namespace rgbcx {

-struct BC1MatchEntry {
-    uint8_t high;
-    uint8_t low;
-    uint8_t error;
-};
-
 class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
   public:
    using InterpolatorPtr = std::shared_ptr<Interpolator>;
-    using BlockMetrics = Color4x4::BlockMetrics;

    enum class Flags : uint32_t {
        None = 0,
@ -109,13 +104,6 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
    void EncodeBlock(Color4x4 pixels, BC1Block *dest) const override;

   private:
-    const InterpolatorPtr _interpolator;
-
-    Flags _flags;
-    unsigned _search_rounds;
-    unsigned _orderings4;
-    unsigned _orderings3;
-
    // Unpacked BC1 block with metadata
    struct EncodeResults {
        Color low;
@ -126,6 +114,30 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
        unsigned error = UINT_MAX;
    };

+    using Hash = uint16_t;
+    using BlockMetrics = Color4x4::BlockMetrics;
+
+    const InterpolatorPtr _interpolator;
+
+    // match tables used for single-color blocks
+    // Each entry includes a high and low pair that best reproduces the 8-bit index as well as possible,
+    // with an included error value
+    // these depend on the interpolator
+    const SingleColorTable<5, 4> _single_match5 = SingleColorTable<5, 4>(_interpolator);
+    const SingleColorTable<6, 4> _single_match6 = SingleColorTable<6, 4>(_interpolator);
+    const SingleColorTable<5, 3> _single_match5_half = SingleColorTable<5, 3>(_interpolator);
+    const SingleColorTable<6, 3> _single_match6_half = SingleColorTable<6, 3>(_interpolator);
+
+    Flags _flags;
+    unsigned _search_rounds;
+    unsigned _orderings4;
+    unsigned _orderings3;
+
+    static OrderTable<4> *order_table4;   // order table for 4-color blocks
+    static OrderTable<3> *order_table3;   // order table for 3-color blocks
+    static std::mutex order_table_mutex;  // prevent race condition with multiple BC1Encoders constructed at once
+    static bool order_tables_generated;   // have the order tables been generated by a previous instance?
+
    void EncodeBlockSingleColor(Color color, BC1Block *dest) const;
    void EncodeBlock4Color(EncodeResults &block, BC1Block *dest) const;

@ -133,36 +145,7 @@ class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {
    unsigned FindSelectors4(Color4x4 pixels, BC1Encoder::EncodeResults &block, bool use_err) const;

    bool ComputeEndpointsLS(Color4x4 pixels, EncodeResults &block, BlockMetrics metrics, bool is_3color, bool use_black) const;
-
-    // match tables used for single-color blocks
-    // Each entry includes a high and low pair that best reproduces the 8-bit index as well as possible,
-    // with an included error value
-    // these depend on the interpolator
-    using MatchList = std::array<BC1MatchEntry, 256>;
-    using MatchListPtr = std::shared_ptr<MatchList>;
-
-    const MatchListPtr _single_match5 = std::make_shared<MatchList>();
-    const MatchListPtr _single_match6 = std::make_shared<MatchList>();
-    const MatchListPtr _single_match5_half = std::make_shared<MatchList>();
-    const MatchListPtr _single_match6_half = std::make_shared<MatchList>();
-
-    // static lookup tables, generated the first time an encoder is created
-    // the mutex prevents race conditions if two encoders are created on different threads
-    static std::mutex _luts_mutex;
-    static bool _luts_initialized;
-
-    // lookup table for hash values
-    static uint16_t g_total_ordering4_hash[4096];
-    static uint16_t g_total_ordering3_hash[256];
-
-    static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3];
-    static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3];
-
-    // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w))
-    // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier.
-    static constexpr uint32_t g_weight_vals4[4] = {0x000009, 0x010204, 0x040201, 0x090000};
-
-    // multiplier is 4 for 3-color
-    static constexpr uint32_t g_weight_vals3[3] = {0x000004, 0x040000, 0x010101};
+/*    bool ComputeEndpointsLS(Color4x4 pixels, EncodeResults &block, BlockMetrics metrics, Hash hash, Vector4 &matrix, std::array<Vector4, 17> &sums,
+                            bool is_3color, bool use_black) const;*/
 };
 }  // namespace rgbcx
--- a/src/BC1/ClusterFit.h
+++ b/src/BC1/ClusterFit.h
@ -0,0 +1,26 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+namespace rgbcx {
+
+// Declaration only; this header provides no definition.
+void ComputeEndpoints();
+
+}  // namespace rgbcx
--- a/src/BC1/OrderTable.h
+++ b/src/BC1/OrderTable.h
@ -0,0 +1,151 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <mutex>
+#include <numeric>
+
+#include "../Vector4.h"
+#include "../util.h"
+#include "tables.h"
+
+namespace rgbcx {
+
+/**
+ * Lookup tables keyed by "total ordering" for N-selector blocks.
+ * An ordering is identified by a Hash that indexes the g_unique_total_orders tables from tables.h;
+ * each entry there supplies one count per selector value.
+ * The constructor builds two tables: packed histogram -> Hash, and Hash -> least-squares factors.
+ * @tparam N Number of selector values (3 or 4)
+ */
+template <size_t N> class OrderTable {
+   public:
+    using Hash = uint16_t;
+    using FactorMatrix = std::array<float, 3>;
+
+    /**
+     * Counts how many of a block's 16 selectors take each of the N selector values.
+     */
+    class Histogram {
+       public:
+        Histogram() : _bins{} {}
+
+        Histogram(const std::array<uint8_t, 16> &sels) : _bins{} {
+            for (unsigned i = 0; i < 16; i++) {
+                assert(sels[i] < N);
+                _bins[sels[i]]++;
+            }
+        }
+
+        uint8_t operator[](size_t index) const {
+            assert(index < N);
+            return _bins[index];
+        }
+        uint8_t &operator[](size_t index) {
+            assert(index < N);
+            return _bins[index];
+        }
+
+        // True when any bin equals 16.
+        bool Any16() const {
+            return std::any_of(_bins.begin(), _bins.end(), [](int count) { return count == 16; });
+        }
+
+        // Packs the first N - 1 bins into 4-bit fields, bin 0 in the lowest bits.
+        unsigned GetPacked() const {
+            unsigned packed = 0;
+            for (unsigned i = 0; i < (N - 1); i++) { packed |= (static_cast<unsigned>(_bins[i]) << (4 * i)); }
+
+            assert(packed < TotalHashes);
+
+            return packed;
+        }
+
+       private:
+        std::array<uint8_t, N> _bins;
+    };
+
+    static inline constexpr size_t UniqueOrderings = (N == 4) ? NUM_UNIQUE_TOTAL_ORDERINGS4 : NUM_UNIQUE_TOTAL_ORDERINGS3;
+    static inline constexpr size_t TotalHashes = (N == 4) ? 4096 : 256;
+
+    // Count stored for one selector value of the ordering identified by hash.
+    static inline constexpr uint8_t GetUniqueOrdering(Hash hash, unsigned selector) {
+        // if/else on the template parameter so that only the matching table is referenced
+        if constexpr (N == 4) {
+            return g_unique_total_orders4[hash][selector];
+        } else {
+            return g_unique_total_orders3[hash][selector];
+        }
+    }
+
+    // Running sums of the first three counts of an ordering: f1 = c0, f2 = c0 + c1, f3 = c0 + c1 + c2.
+    static inline constexpr void GetUniqueOrderingSums(Hash hash, unsigned &f1, unsigned &f2, unsigned &f3) {
+        f1 = GetUniqueOrdering(hash, 0);
+        f2 = f1 + GetUniqueOrdering(hash, 1);
+        f3 = f2 + GetUniqueOrdering(hash, 2);
+    }
+
+    OrderTable() {
+        static_assert(N == 4 || N == 3);
+
+        const unsigned *weight_vals = (N == 4) ? g_weight_vals4 : g_weight_vals3;
+        const float denominator = (N == 4) ? 3.0f : 2.0f;
+
+        for (unsigned i = 0; i < UniqueOrderings; i++) {
+            Histogram h;
+            for (unsigned j = 0; j < N; j++) { h[j] = GetUniqueOrdering(static_cast<Hash>(i), j); }
+
+            // A count of 16 does not fit in a 4-bit field; those orderings are resolved by GetSingleColorHashes instead.
+            if (!h.Any16()) _hashes[h.GetPacked()] = static_cast<Hash>(i);
+
+            // Each weight packs three byte-sized terms, so one multiply-accumulate sums all three at once.
+            unsigned weight_accum = 0;
+            for (unsigned sel = 0; sel < N; sel++) weight_accum += (weight_vals[sel] * h[sel]);
+
+            // todo: use a Vector4 here instead for SIMD readiness
+            const float z00 = (float)((weight_accum >> 16) & 0xFF);
+            const float z10 = (float)((weight_accum >> 8) & 0xFF);
+            const float z11 = (float)(weight_accum & 0xFF);
+            const float z01 = z10;
+
+            // Store the scaled inverse of the symmetric 2x2 matrix [z00 z01; z10 z11], or zeros when it is singular.
+            float det = z00 * z11 - z01 * z10;
+            if (fabs(det) < 1e-8f) {
+                _factors[i][0] = 0;
+                _factors[i][1] = 0;
+                _factors[i][2] = 0;
+            } else {
+                det = (denominator / 255.0f) / det;
+                _factors[i][0] = z11 * det;
+                _factors[i][1] = -z10 * det;
+                _factors[i][2] = z00 * det;
+            }
+        }
+    }
+
+    // Looks up the ordering hash for a histogram; histograms with a single full bin use the fixed single-color hashes.
+    Hash GetHash(const Histogram &hist) const {
+        for (unsigned i = 0; i < N; i++) {
+            if (hist[i] == 16) return GetSingleColorHashes()[i];
+        }
+
+        return _hashes[hist.GetPacked()];
+    }
+
+    // Returns the three stored factors of an ordering as (f0, f1, f1, f2).
+    Vector4 GetFactors(Hash hash) const {
+        assert(hash < UniqueOrderings);
+        return Vector4(_factors[hash][0], _factors[hash][1], _factors[hash][1], _factors[hash][2]);
+    }
+
+    // Hash returned by GetHash when all 16 selectors share selector value i.
+    static inline constexpr std::array<Hash, N> GetSingleColorHashes() {
+        // if constexpr keeps the 4-element initializer from being instantiated for N == 3
+        if constexpr (N == 4) {
+            return {15, 700, 753, 515};
+        } else {
+            return {12, 15, 89};
+        }
+    }
+
+    static inline constexpr bool IsSingleColor(Hash hash) {
+        // plain loop instead of std::find, which is not constexpr before C++20
+        const auto hashes = GetSingleColorHashes();
+        for (size_t i = 0; i < N; i++) {
+            if (hashes[i] == hash) return true;
+        }
+        return false;
+    }
+
+   private:
+    // Zero-initialized so slots the constructor never writes still hold a defined value.
+    std::array<Hash, TotalHashes> _hashes{};
+    std::array<FactorMatrix, UniqueOrderings> _factors{};
+};
+
+}  // namespace rgbcx
--- a/src/BC1/SingleColorTable.h
+++ b/src/BC1/SingleColorTable.h
@ -0,0 +1,107 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <memory>
+
+#include "../Interpolator.h"
+#include "../Util.h"
+
+namespace rgbcx {
+
+/**
+ * Lookup table for single-color blocks.
+ * For each 8-bit channel value, stores the low/high endpoint pair whose interpolated value
+ * lands closest to it, together with the remaining error. The contents depend on the interpolator.
+ * Copies of a table share the same underlying storage.
+ * @tparam B Number of bits (5 or 6)
+ * @tparam N Number of colors (3 or 4); 3 selects the interpolator's "half" functions, 4 the regular ones
+ */
+template <size_t B, size_t N> class SingleColorTable {
+   public:
+    struct MatchEntry {
+        uint8_t high;
+        uint8_t low;
+        uint8_t error;
+    };
+
+    using MatchList = std::array<MatchEntry, 256>;
+    using MatchListPtr = std::shared_ptr<MatchList>;
+    using InterpolatorPtr = std::shared_ptr<Interpolator>;
+
+    /**
+     * Builds the table by exhaustively trying every low/high endpoint pair for every 8-bit value.
+     * @param interpolator Interpolator used to compute the interpolated value of each endpoint pair
+     */
+    SingleColorTable(InterpolatorPtr interpolator) {
+        static_assert((B == 5 && Size == 32) || (B == 6 && Size == 64));
+        static_assert(N == 4 || N == 3);
+
+        const bool ideal = interpolator->IsIdeal();
+        const bool use_8bit = interpolator->CanInterpolate8Bit();
+
+        for (unsigned i = 0; i < 256; i++) {
+            unsigned error = 256;
+
+            // TODO: Can probably avoid testing for values that definitely wont yield good results,
+            // e.g. low8 and high8 both much smaller or larger than index
+            for (uint8_t low = 0; low < Size; low++) {
+                const uint8_t low8 = (B == 5) ? scale5To8(low) : scale6To8(low);
+
+                for (uint8_t high = 0; high < Size; high++) {
+                    const uint8_t high8 = (B == 5) ? scale5To8(high) : scale6To8(high);
+                    uint8_t value;
+
+                    // 3-color tables must be built from the "half" interpolation, 4-color tables from the regular one
+                    if constexpr (N == 3) {
+                        if (use_8bit) {
+                            value = interpolator->InterpolateHalf8(high8, low8);
+                        } else {
+                            value = (B == 5) ? interpolator->InterpolateHalf5(high, low) : interpolator->InterpolateHalf6(high, low);
+                        }
+                    } else {
+                        if (use_8bit) {
+                            value = interpolator->Interpolate8(high8, low8);
+                        } else {
+                            value = (B == 5) ? interpolator->Interpolate5(high, low) : interpolator->Interpolate6(high, low);
+                        }
+                    }
+
+                    unsigned new_error = iabs(value - (int)i);
+
+                    // We only need to factor in 3% error in BC1 ideal mode.
+                    if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100;
+
+                    // On a tie, prefer equal endpoints.
+                    if ((new_error < error) || (new_error == error && low == high)) {
+                        assert(new_error <= UINT8_MAX);
+
+                        (*_matches)[i].low = (uint8_t)low;
+                        (*_matches)[i].high = (uint8_t)high;
+                        (*_matches)[i].error = (uint8_t)new_error;
+
+                        error = new_error;
+                    }
+                }
+            }
+        }
+    }
+
+    MatchEntry operator[](size_t index) const {
+        assert(index <= UINT8_MAX);
+        return (*_matches)[index];
+    }
+    MatchEntry &operator[](size_t index) {
+        assert(index <= UINT8_MAX);
+        return (*_matches)[index];
+    }
+
+   private:
+    // number of distinct endpoint values representable in B bits
+    static inline constexpr size_t Size = 1 << B;
+
+    MatchListPtr _matches = std::make_shared<MatchList>();
+};
+
+}  // namespace rgbcx
--- a/src/BC1/tables.h
+++ b/src/BC1/tables.h
@ -4,6 +4,13 @@
 #pragma once
 #include <cstdint>

+// This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w))
+// where w is [0,1/3,2/3,1]. 9 is the perfect multiplier.
+// The three terms are packed one per byte, in that order from the highest byte (bits 16-23) to the lowest (bits 0-7).
+static constexpr uint32_t g_weight_vals4[4] = {0x000009, 0x010204, 0x040201, 0x090000};
+
+// multiplier is 4 for 3-color
+// Same three terms and byte packing; the entries correspond to w = [0, 1, 1/2].
+static constexpr uint32_t g_weight_vals3[3] = {0x000004, 0x040000, 0x010101};
+
 const uint32_t MIN_TOTAL_ORDERINGS = 1;
 const uint32_t MAX_TOTAL_ORDERINGS3 = 32;

--- a/src/BlockEncoder.h
+++ b/src/BlockEncoder.h
@ -36,7 +36,7 @@ template <class B, size_t M, size_t N> class BlockEncoder {

    virtual void EncodeBlock(DecodedBlock pixels, EncodedBlock *dest) const = 0;

-    void EncodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
+    virtual void EncodeImage(uint8_t *encoded, Color *decoded, unsigned image_width, unsigned image_height) {
        assert(image_width % N == 0);
        assert(image_width % M == 0);

--- a/src/Vector4.h
+++ b/src/Vector4.h
@ -20,8 +20,8 @@
 #pragma once

 #include <array>
-#include <functional>
 #include <cmath>
+#include <functional>

 #include "Color.h"

@ -29,7 +29,9 @@ namespace rgbcx {

 class Vector4 {
   public:
-    Vector4(float x = 0, float y = 0, float z = 0, float w = 0) {
+    Vector4() : Vector4(0) {}
+
+    Vector4(float x, float y, float z = 0, float w = 0) {
        _c[0] = x;
        _c[1] = y;
        _c[2] = z;
@ -96,6 +98,9 @@ class Vector4 {
        return max;
    }

+    // Squared magnitude: the dot product of this vector with itself, converted to unsigned.
+    unsigned int SqrMag() { return static_cast<unsigned>(Dot(*this, *this)); }
+
+
   private:
    template <typename Op> friend Vector4 DoOp(const Vector4 &lhs, const Vector4 &rhs, Op f) {
        Vector4 r;
--- a/src/Vector4Int.h
+++ b/src/Vector4Int.h
@ -49,7 +49,7 @@ class Vector4Int {

    static Vector4Int FromColorRGB(const Color &c) { return Vector4Int(c.r, c.g, c.b); }

-    static int Dot(Vector4Int &lhs, Vector4Int &rhs) {
+    static int Dot(const Vector4Int &lhs, const Vector4Int &rhs) {
        int sum = 0;
        for (unsigned i = 0; i < 4; i++) { sum += lhs[i] * rhs[i]; }
        return sum;
@ -86,7 +86,7 @@ class Vector4Int {
    friend Vector4Int &operator*=(Vector4Int &lhs, const int &rhs) { return lhs = lhs * rhs; }
    friend Vector4Int &operator/=(Vector4Int &lhs, const int &rhs) { return lhs = lhs / rhs; }

-    int Dot(Vector4Int other) { return Dot(*this, other); }
+    int Dot(const Vector4Int &other) const { return Dot(*this, other); }
    int MaxAbs(unsigned channels = 4) {
        assert(channels < 5);
        assert(channels > 0);
--- a/src/util.h
+++ b/src/util.h
@ -39,7 +39,7 @@ template <typename S> constexpr auto iabs(S i) {

 /**
 * Unpacks an unsigned integer into an array of smaller integers.
- * @tparam I Input data type. Must be an unsigned integral type large enough to hold C * S bits.
+ * @tparam I Input data type. Must be an unsigned integral type large enough to hold C * S bits.
 * @tparam O Output data type. must be an unsigned integral type large enough to hold C bits..
 * @tparam S Number of bits in each value.
 * @tparam C Number of values to unpack.
@ -53,7 +53,7 @@ template <typename I, typename O, size_t S, size_t C> constexpr auto Unpack(I pa
    static_assert(std::numeric_limits<I>::digits >= (C * S), "Packed input type must be big enough to represent the number of bits multiplied by count");
    static_assert(std::numeric_limits<O>::digits >= S, "Unpacked output type must be big enough to represent the number of bits");

-    constexpr O mask = (1U << S) - 1U;  // maximum value representable by S bits
+    constexpr O mask = (1U << S) - 1U;  // maximum value representable by S bits
    std::array<O, C> vals;              // output values array of size C

    for (unsigned i = 0; i < C; i++) {
@ -67,7 +67,7 @@ template <typename I, typename O, size_t S, size_t C> constexpr auto Unpack(I pa
 /**
 * Packs an array of unsigned integers into a single integer.
 * @tparam I Input data type. Must be an unsigned integral type large enough to hold C bits.
- * @tparam O Output data type. must be an unsigned integral type large enough to hold C * S bits.
+ * @tparam O Output data type. must be an unsigned integral type large enough to hold C * S bits.
 * @tparam S Number of bits in each value.
 * @tparam C Number of values to unpack.
 * @param vals Unpacked std::array of type I and size C.