From c879061e4e59e693ec32b8e325ee63584d2869ce Mon Sep 17 00:00:00 2001
From: drewcassidy <drewcassidy@me.com>
Date: Thu, 18 Feb 2021 02:43:57 -0800
Subject: [PATCH] Add (extremely bad) BC1 encoding support

---
 src/BC1/BC1Encoder.cpp | 175 +++++++++++++++++++++++++++++++++++++++++
 src/BC1/BC1Encoder.h   | 145 ++++++++++++++++++++++++++++++++++
 src/BlockView.h        |  31 ++++++++
 src/Interpolator.h     |  61 +++++++-------
 src/test/test.cpp      |   5 ++
 5 files changed, 388 insertions(+), 29 deletions(-)
 create mode 100644 src/BC1/BC1Encoder.cpp
 create mode 100644 src/BC1/BC1Encoder.h
diff --git a/src/BC1/BC1Encoder.cpp b/src/BC1/BC1Encoder.cpp
new file mode 100644
index 0000000..06a1021
--- /dev/null
+++ b/src/BC1/BC1Encoder.cpp
@@ -0,0 +1,175 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "BC1Encoder.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "../BlockView.h"
+#include "../Color.h"
+#include "../bitwiseEnums.h"
+
+namespace rgbcx {
+using MatchList = std::array<BC1MatchEntry, 256>;  // one entry per 8-bit channel value (indexed 0..255 below)
+using MatchListPtr = std::shared_ptr<MatchList>;  // shared handle to a MatchList
+using InterpolatorPtr = std::shared_ptr<Interpolator>;  // shared handle to an Interpolator, as taken by the BC1Encoder constructor
+
+// region Free Functions/Templates
+inline void PrepSingleColorTableEntry(unsigned &error, MatchList &match_table, uint8_t v, unsigned i, uint8_t low, uint8_t high, uint8_t low8, uint8_t high8,
+                                      bool ideal) {
+    // Scores the endpoint pair (low, high) against target i and records it in match_table[i] when it beats `error`.
+    unsigned candidate = iabs(v - (int)i);
+    // We only need to factor in 3% error in BC1 ideal mode.
+    if (ideal) candidate += (iabs(high8 - (int)low8) * 3) / 100;
+
+    // A tie only displaces the stored entry when the endpoints are equal (favored for GPUs that approximate the interpolation).
+    const bool strictly_better = candidate < error;
+    const bool equal_endpoint_tie = (candidate == error) && (low == high);
+    if (!strictly_better && !equal_endpoint_tie) return;
+    assert(candidate <= UINT8_MAX);
+    BC1MatchEntry &entry = match_table[i];
+    entry.low = low;
+    entry.high = high;
+    entry.error = (uint8_t)candidate;
+    error = candidate;
+}
+
+template <size_t S> void PrepSingleColorTable(MatchList &match_table, MatchList &match_table_half, Interpolator &interpolator) {
+    // For every 8-bit target i, searches all S-bit (low, high) endpoint pairs and keeps the best one per table:
+    // match_table scores the Interpolate* result, match_table_half scores the InterpolateHalf* result.
+    const unsigned size = 1 << S;
+    assert((S == 5 && size == 32) || (S == 6 && size == 64));
+    const bool ideal = interpolator.IsIdeal();
+    const bool use_8bit = interpolator.CanInterpolate8Bit();
+
+    for (unsigned i = 0; i < 256; i++) {
+        unsigned error = 256;  // sentinel above the 8-bit error range stored in the table
+        unsigned error_half = 256;
+
+        // TODO: Can probably avoid testing for values that definitely wont yield good results,
+        // e.g. low8 and high8 both much smaller or larger than index
+        for (uint8_t low = 0; low < size; low++) {
+            uint8_t low8 = (S == 5) ? scale5To8(low) : scale6To8(low);
+
+            for (uint8_t high = 0; high < size; high++) {
+                // high8 must be expanded from `high` for both widths: the 8-bit interpolation and the ideal-mode penalty read it
+                uint8_t high8 = (S == 5) ? scale5To8(high) : scale6To8(high);
+                uint8_t value, value_half;
+                if (use_8bit) {
+                    value = interpolator.Interpolate8(high8, low8);
+                    value_half = interpolator.InterpolateHalf8(high8, low8);
+                } else {
+                    value = (S == 5) ? interpolator.Interpolate5(high, low) : interpolator.Interpolate6(high, low);
+                    value_half = (S == 5) ? interpolator.InterpolateHalf5(high, low) : interpolator.InterpolateHalf6(high, low);
+                }
+
+                PrepSingleColorTableEntry(error, match_table, value, i, low, high, low8, high8, ideal);
+                PrepSingleColorTableEntry(error_half, match_table_half, value_half, i, low, high, low8, high8, ideal);
+            }
+        }
+    }
+}
+// endregion
+
+BC1Encoder::BC1Encoder(InterpolatorPtr interpolator) : _interpolator{interpolator} {
+    PrepSingleColorTable<5>(*_single_match5, *_single_match5_half, *_interpolator);  // tables for 5-bit endpoints
+    PrepSingleColorTable<6>(*_single_match6, *_single_match6_half, *_interpolator);  // tables for 6-bit endpoints
+}
+
+void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const {
+    // per-channel views of the block; nothing below consumes them yet
+    auto r_view = pixels.GetChannel(0);
+    auto g_view = pixels.GetChannel(1);
+    auto b_view = pixels.GetChannel(2);
+    const bool treat_as_solid = pixels.IsSingleColor() || true;  // for now assume (wrongly) everything is a single-color block
+    if (!treat_as_solid) {
+        // unreachable while the `|| true` above is in place; only computes min/max/avg and discards them
+        Color min, max, avg;
+        pixels.GetMinMaxAvgRGB(min, max, avg);
+        return;
+    }
+    EncodeBlockSingleColor(pixels.Get(0, 0), dest);  // single-color pixel block, do it the fast way
+}
+
+void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const {
+    // Encodes a solid-color block from the precomputed single-color match tables.
+    // low_ep/high_ep are 5:6:5 endpoints packed from the tables' low/high fields; selector_row is one row of selectors.
+    uint16_t low_ep, high_ep;
+    uint8_t selector_row = 0xAA;  // 2222
+    bool three_color = false;
+
+    // 4-color candidates per channel: 5-bit tables for red and blue, 6-bit table for green
+    const BC1MatchEntry r4 = (*_single_match5)[color.r];
+    const BC1MatchEntry g4 = (*_single_match6)[color.g];
+    const BC1MatchEntry b4 = (*_single_match5)[color.b];
+
+    const Flags three_color_flags = Flags::Use3ColorBlocks | Flags::Use3ColorBlocksForBlackPixels;
+    if ((_flags & three_color_flags) != Flags::None) {
+        // 3-color candidates come from the "half" tables
+        const BC1MatchEntry r3 = (*_single_match5_half)[color.r];
+        const BC1MatchEntry g3 = (*_single_match6_half)[color.g];
+        const BC1MatchEntry b3 = (*_single_match5_half)[color.b];
+
+        const unsigned error_4color = r4.error + g4.error + b4.error;
+        const unsigned error_3color = r3.error + g3.error + b3.error;
+
+        // only a strictly lower summed error selects the 3-color candidates
+        if (error_3color < error_4color) {
+            low_ep = Color::Pack565Unscaled(r3.low, g3.low, b3.low);
+            high_ep = Color::Pack565Unscaled(r3.high, g3.high, b3.high);
+
+            // order the pair so that high_ep <= low_ep
+            if (high_ep > low_ep) std::swap(low_ep, high_ep);
+            three_color = true;
+        }
+    }
+
+    // 4-color path: must finish with high_ep strictly above low_ep
+    if (!three_color) {
+        low_ep = Color::Pack565Unscaled(r4.low, g4.low, b4.low);
+        high_ep = Color::Pack565Unscaled(r4.high, g4.high, b4.high);
+
+        if (high_ep < low_ep) {
+            std::swap(low_ep, high_ep);
+            selector_row = 0xFF;  // invert mask to 3333
+        } else if (low_ep == high_ep) {
+            // make sure this isnt accidentally a 3-color block: force high_ep > low_ep
+            if (low_ep == 0) {
+                assert(low_ep == 0 && high_ep == 0);
+                high_ep = 1;
+                low_ep = 0;
+                selector_row = 0x55;  // 1111 (min value only, max is ignored)
+            } else {
+                low_ep--;
+                selector_row = 0;  // all selectors 0 once the equal pair has been split
+            }
+        }
+        assert(high_ep > low_ep);
+    }
+
+    // note the crossover: high_ep is written through SetLowColor and low_ep through SetHighColor
+    dest->SetLowColor(high_ep);
+    dest->SetHighColor(low_ep);
+    for (unsigned row = 0; row < 4; row++) {
+        dest->selectors[row] = selector_row;
+    }
+}
+
+}  // namespace rgbcx
\ No newline at end of file
diff --git a/src/BC1/BC1Encoder.h b/src/BC1/BC1Encoder.h
new file mode 100644
index 0000000..4b20885
--- /dev/null
+++ b/src/BC1/BC1Encoder.h
@@ -0,0 +1,145 @@
+/*  Python-rgbcx Texture Compression Library
+    Copyright (C) 2021 Andrew Cassidy <drewcassidy@me.com>
+    Partially derived from rgbcx.h written by Richard Geldreich <richgel99@gmail.com>
+    and licenced under the public domain
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <mutex>
+
+#include "../BlockEncoder.h"
+#include "../BlockView.h"
+#include "../Interpolator.h"
+#include "../bitwiseEnums.h"
+#include "../ndebug.h"
+#include "../tables.h"
+#include "BC1Block.h"
+
+namespace rgbcx {
+
+struct BC1MatchEntry {  // one row of a single-color match table
+    uint8_t high;   // high endpoint of the best pair found for this 8-bit value
+    uint8_t low;    // low endpoint of that pair
+    uint8_t error;  // error of that pair; asserted to fit in 8 bits when the tables are built
+};
+
+class BC1Encoder : public BlockEncoder<BC1Block, 4, 4> {  // block encoder producing BC1Block from 4x4 pixel blocks
+   public:
+    using InterpolatorPtr = std::shared_ptr<Interpolator>;
+
+    enum class Flags : uint32_t {
+        None = 0,
+
+        // Try to improve quality using the most likely total orderings.
+        // The total_orderings_to_try parameter will then control the number of total orderings to try for 4 color blocks, and the
+        // total_orderings_to_try3 parameter will control the number of total orderings to try for 3 color blocks (if they are enabled).
+        UseLikelyTotalOrderings = 2,
+
+        // Use 2 least squares pass, instead of one (same as stb_dxt's HIGHQUAL option).
+        // Recommended if you're enabling UseLikelyTotalOrderings.
+        TwoLeastSquaresPasses = 4,
+
+        // Use3ColorBlocksForBlackPixels allows the BC1 encoder to use 3-color blocks for blocks containing black or very dark pixels.
+        // Your shader/engine MUST ignore the alpha channel on textures encoded with this flag.
+        // Average quality goes up substantially for my 100 texture corpus (~.5 dB), so it's worth using if you can.
+        // Note the BC1 encoder does not actually support transparency in 3-color mode.
+        // Don't set when encoding to BC3.
+        Use3ColorBlocksForBlackPixels = 8,
+
+        // If Use3ColorBlocks is set, the encoder can use 3-color mode for a small but noticeable gain in average quality, but lower perf.
+        // If you also specify the UseLikelyTotalOrderings flag, set the total_orderings_to_try3 parameter to the number of total orderings to try.
+        // Don't set when encoding to BC3.
+        Use3ColorBlocks = 16,
+
+        // Iterative will greatly increase encode time, but is very slightly higher quality.
+        // Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, unless you just don't care about perf. at all.
+        Iterative = 32,
+
+        // BoundingBox enables a fast all-integer PCA approximation on 4-color blocks.
+        // At level 0 options (no other flags), this is ~15% faster, and higher *average* quality.
+        BoundingBox = 64,
+
+        // Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks.
+        UseFasterMSEEval = 128,
+
+        // Examine all colors to compute selectors/MSE (slower than default)
+        UseFullMSEEval = 256,
+
+        // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead of PCA.
+        // Around 18% faster, very slightly lower average quality to better (depends on the content).
+        Use2DLS = 512,
+
+        // Use 6 power iterations vs. 4 for PCA.
+        Use6PowerIters = 2048,
+
+        // Check all total orderings - *very* slow. The encoder is not designed to be used in this way.
+        Exhaustive = 8192,
+
+        // Try 2 different ways of choosing the initial endpoints.
+        TryAllInitialEndponts = 16384,
+
+        // Same as BoundingBox, but implemented using integer math (faster, slightly less quality)
+        BoundingBoxInt = 32768,
+
+        // Try refining the final endpoints by examining nearby colors.
+        EndpointSearchRoundsShift = 22,
+        EndpointSearchRoundsMask = 1023U << EndpointSearchRoundsShift,
+    };
+    // Stores the interpolator; the constructor definition fills the single-color match tables from it.
+    BC1Encoder(InterpolatorPtr interpolator);
+
+    // Encodes one 4x4 pixel block into dest.
+    void EncodeBlock(Color4x4 pixels, BC1Block *dest) const override;
+
+   private:
+    const InterpolatorPtr _interpolator;
+
+    Flags _flags = Flags::None;    // default-initialized so reads (e.g. in EncodeBlockSingleColor) never see an indeterminate value
+    unsigned _search_rounds = 0;   // the three counters below likewise start from a defined value
+    unsigned _orderings4 = 0;
+    unsigned _orderings3 = 0;
+
+    void EncodeBlockSingleColor(Color color, BC1Block *dest) const;
+
+    // match tables used for single-color blocks
+    // Each entry holds the high and low pair that best reproduces the 8-bit index,
+    // with an included error value
+    // these depend on the interpolator
+    using MatchList = std::array<BC1MatchEntry, 256>;
+    using MatchListPtr = std::shared_ptr<MatchList>;
+
+    const MatchListPtr _single_match5 = std::make_shared<MatchList>();
+    const MatchListPtr _single_match6 = std::make_shared<MatchList>();
+    const MatchListPtr _single_match5_half = std::make_shared<MatchList>();
+    const MatchListPtr _single_match6_half = std::make_shared<MatchList>();
+
+    // static lookup tables, generated the first time an encoder is created
+    // the mutex prevents race conditions if two encoders are created on different threads
+    static std::mutex _luts_mutex;
+    static bool _luts_initialized;
+
+    // lookup table for hash values
+    static uint16_t g_total_ordering4_hash[4096];
+    static uint16_t g_total_ordering3_hash[256];
+
+    static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3];
+    static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3];
+};
+}  // namespace rgbcx
diff --git a/src/BlockView.h b/src/BlockView.h
index 7b97c2a..973660c 100644
--- a/src/BlockView.h
+++ b/src/BlockView.h
@@ -81,6 +81,9 @@ template <typename S, size_t M, size_t N> class BlockView {
         start[(row_stride * (int)y) + (pixel_stride * (int)x)] = value;
     }
 
+    constexpr S &Get(unsigned i) noexcept(ndebug) { const auto x = i % N; const auto y = i / N; return Get(x, y); }  // flat index -> (x, y), N entries per row
+    constexpr S Get(unsigned i) const noexcept(ndebug) { const auto x = i % N; const auto y = i / N; return Get(x, y); }  // const flat-index access, same mapping
+
     constexpr std::array<S, M * N> Flatten() noexcept {
         std::array<S, M * N> result;
         for (unsigned x = 0; x < N; x++) {
@@ -108,6 +111,34 @@ template <size_t M, size_t N> class ColorBlockView : public BlockView<Color, M,
     }
 
     void SetRGB(unsigned x, unsigned y, Color value) noexcept(ndebug) { Base::Get(x, y).SetRGB(value); }
+
+    bool IsSingleColor() {
+        // true when no pixel compares unequal to the pixel at (0, 0)
+        auto reference = Base::Get(0, 0);
+        unsigned index = 1;
+        while (index < M * N && !(Base::Get(index) != reference)) index++;
+        return index >= M * N;
+    }
+
+    void GetMinMaxAvgRGB(Color &min, Color &max, Color &avg) {
+        // Per-channel minimum, maximum and truncated mean of channels 0..2 over all M * N pixels; avg's other channels are untouched.
+        min = Base::Get(0, 0);
+        max = Base::Get(0, 0);
+        std::array<unsigned, 3> sums = {0, 0, 0};  // zeroed; the loop starts at pixel 0 so every pixel is summed
+        for (unsigned i = 0; i < M * N; i++) {
+            auto val = Base::Get(i);
+            for (unsigned c = 0; c < 3; c++) {
+                if (val[c] < min[c]) {
+                    min[c] = val[c];
+                } else if (val[c] > max[c]) {  // max only grows; a value between min and max changes neither
+                    max[c] = val[c];
+                }
+                sums[c] += val[c];
+            }
+        }
+
+        for (unsigned c = 0; c < 3; c++) { avg[c] = (uint8_t)(sums[c] / (M * N)); }
+    }
 };
 
 using Color4x4 = ColorBlockView<4, 4>;
diff --git a/src/Interpolator.h b/src/Interpolator.h
index 5469659..4dc5d45 100644
--- a/src/Interpolator.h
+++ b/src/Interpolator.h
@@ -52,6 +52,16 @@ class Interpolator {
      */
     virtual uint8_t Interpolate6(uint8_t v0, uint8_t v1) const;
 
+    /**
+     * Performs a 2/3 interpolation of a pair of 8-bit values to produce an 8-bit value
+     * Output is approximately (2v0 + v1)/3.
+     * Output is not guaranteed to be accurate for the given interpolator if CanInterpolate8Bit() is false
+     * @param v0 The first 8-bit value
+     * @param v1 The second 8-bit value
+     * @return The interpolated value
+     */
+    virtual uint8_t Interpolate8(uint8_t v0, uint8_t v1) const;
+
     /**
      * Performs a 1/2 interpolation of a pair of 5-bit values to produce an 8-bit value
      * Output is approximately (v0 + v1)/2, with v0 and v1 first extended to 8 bits.
@@ -70,6 +80,16 @@ class Interpolator {
      */
     virtual uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const;
 
+    /**
+     * Performs a 1/2 interpolation of a pair of 8-bit values to produce an 8-bit value
+     * Output is approximately (v0 + v1)/2.
+     * Output is not guaranteed to be accurate for the given interpolator if CanInterpolate8Bit() is false
+     * @param v0 The first 8-bit value
+     * @param v1 The second 8-bit value
+     * @return The interpolated value
+     */
+    virtual uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const;
+
     /**
      * Generates the 4 colors for a BC1 block from the given 5:6:5-packed colors
      * @param low first 5:6:5 color for the block
@@ -84,6 +104,8 @@ class Interpolator {
      */
     virtual Type GetType() const noexcept { return Type::Ideal; }
 
+    virtual bool CanInterpolate8Bit() const noexcept { return true; }
+
     /**
      * Checks if the interpolator uses an ideal algorithm
      * @return true if the interpolator is ideal, false otherwise.
@@ -94,21 +116,6 @@ class Interpolator {
     }
 
    private:
-    virtual uint8_t Interpolate8(uint8_t v0, uint8_t v1) const;
-    virtual uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const;
-
-    //    constexpr static auto Expand5 = ExpandArray<Size5, scale5To8>();
-    //    constexpr static auto Expand6 = ExpandArray<size6, scale6To8>();
-    //
-    //    // match tables used for single-color blocks
-    //    using MatchList = std::array<MatchEntry, match_count>;
-    //    using MatchListPtr = std::shared_ptr<MatchList>;
-    //
-    //    const MatchListPtr _single_match5 = {std::make_shared<MatchList>()};
-    //    const MatchListPtr _single_match6 = {std::make_shared<MatchList>()};
-    //    const MatchListPtr _single_match5_half = {std::make_shared<MatchList>()};
-    //    const MatchListPtr _single_match6_half = {std::make_shared<MatchList>()};
-
     Color InterpolateColor24(const Color &c0, const Color &c1) const {
         return Color(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b));
     }
@@ -116,33 +123,29 @@ class Interpolator {
     Color InterpolateHalfColor24(const Color &c0, const Color &c1) const {
         return Color(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b));
     }
-
-    //    virtual constexpr bool useExpandedInMatch() noexcept { return true; }
-    //
-    //    void PrepSingleColorTables(const MatchListPtr &matchTable, const MatchListPtr &matchTableHalf, int len);
-    //
-    //    int PrepSingleColorTableEntry(const MatchListPtr &matchTable, int v, int i, int low, int high, int low_e, int high_e, int lowest_error, bool half,
-    //                                  bool ideal);
 };
 
 class InterpolatorRound : public Interpolator {
    public:
     uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override;
     uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override;
-    Type GetType() const noexcept override { return Type::IdealRound; }
-
-   private:
     uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override;
+
+    Type GetType() const noexcept override { return Type::IdealRound; }
 };
 
 class InterpolatorNvidia : public Interpolator {
    public:
     uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override;
     uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override;
+
     uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
     uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;
+
     std::array<Color, 4> InterpolateBC1(uint16_t low, uint16_t high) const override;
+
     Type GetType() const noexcept override { return Type::Nvidia; }
+    bool CanInterpolate8Bit() const noexcept override { return false; }
 
    private:
     Color InterpolateColor565(const Color &c0, const Color &c1) const {
@@ -158,12 +161,12 @@ class InterpolatorAMD : public Interpolator {
    public:
     uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override;
     uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override;
+    uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override;
+
     uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override;
     uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override;
-    Type GetType() const noexcept override { return Type::AMD; }
-
-   private:
-    uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override;
     uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const override;
+
+    Type GetType() const noexcept override { return Type::AMD; }
 };
 }  // namespace rgbcx
\ No newline at end of file
diff --git a/src/test/test.cpp b/src/test/test.cpp
index 126fa74..8e17cce 100644
--- a/src/test/test.cpp
+++ b/src/test/test.cpp
@@ -18,6 +18,7 @@
 #include <vector>
 
 #include "../BC4/BC4Encoder.h"
+#include "../BC1/BC1Encoder.h"
 #include "../rgbcx.h"
 #include "../rgbcxDecoders.h"
 #include "../util.h"
@@ -671,7 +672,11 @@ int main(int argc, char *argv[]) {
 
         for (int i = 0; i < test_count; i++)
             bc4_encoder.EncodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), src, source_image.width(), source_image.height());
+    } else if (dxgi_format == DXGI_FORMAT_BC1_UNORM) {
+        auto bc1_encoder = BC1Encoder(Interpolator::MakeInterpolator());
+        Color *src = &source_image.get_pixels()[0];
 
+        bc1_encoder.EncodeImage(reinterpret_cast<uint8_t *>(&packed_image8[0]), src, source_image.width(), source_image.height());
     } else {
         for (uint32_t by = 0; by < blocks_y; by++) {
             for (uint32_t bx = 0; bx < blocks_x; bx++) {