From c879061e4e59e693ec32b8e325ee63584d2869ce Mon Sep 17 00:00:00 2001 From: drewcassidy Date: Thu, 18 Feb 2021 02:43:57 -0800 Subject: [PATCH] Add (extremely bad) BC1 encoding support --- src/BC1/BC1Encoder.cpp | 175 +++++++++++++++++++++++++++++++++++++++++ src/BC1/BC1Encoder.h | 145 ++++++++++++++++++++++++++++++++++ src/BlockView.h | 31 ++++++++ src/Interpolator.h | 61 +++++++------- src/test/test.cpp | 5 ++ 5 files changed, 388 insertions(+), 29 deletions(-) create mode 100644 src/BC1/BC1Encoder.cpp create mode 100644 src/BC1/BC1Encoder.h diff --git a/src/BC1/BC1Encoder.cpp b/src/BC1/BC1Encoder.cpp new file mode 100644 index 0000000..06a1021 --- /dev/null +++ b/src/BC1/BC1Encoder.cpp @@ -0,0 +1,175 @@ +/* Python-rgbcx Texture Compression Library + Copyright (C) 2021 Andrew Cassidy + Partially derived from rgbcx.h written by Richard Geldreich + and licenced under the public domain + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . 
+ */ + +#include "BC1Encoder.h" + +#include +#include + +#include "../BlockView.h" +#include "../Color.h" +#include "../bitwiseEnums.h" + +namespace rgbcx { +using MatchList = std::array; +using MatchListPtr = std::shared_ptr; +using InterpolatorPtr = std::shared_ptr; + +// region Free Functions/Templates +inline void PrepSingleColorTableEntry(unsigned &error, MatchList &match_table, uint8_t v, unsigned i, uint8_t low, uint8_t high, uint8_t low8, uint8_t high8, + bool ideal) { + unsigned new_error = iabs(v - (int)i); + + // We only need to factor in 3% error in BC1 ideal mode. + if (ideal) new_error += (iabs(high8 - (int)low8) * 3) / 100; + + // Favor equal endpoints, for lower error on actual GPUs which approximate the interpolation. + if ((new_error < error) || (new_error == error && low == high)) { + assert(new_error <= UINT8_MAX); + + match_table[i].low = (uint8_t)low; + match_table[i].high = (uint8_t)high; + match_table[i].error = (uint8_t)new_error; + + error = new_error; + } +} + +template void PrepSingleColorTable(MatchList &match_table, MatchList &match_table_half, Interpolator &interpolator) { + unsigned size = 1 << S; + + assert((S == 5 && size == 32) || (S == 6 && size == 64)); + + bool ideal = interpolator.IsIdeal(); + bool use_8bit = interpolator.CanInterpolate8Bit(); + + for (unsigned i = 0; i < 256; i++) { + unsigned error = 256; + unsigned error_half = 256; + + // TODO: Can probably avoid testing for values that definitely won't yield good results, + // e.g. low8 and high8 both much smaller or larger than index + for (uint8_t low = 0; low < size; low++) { + uint8_t low8 = (S == 5) ? scale5To8(low) : scale6To8(low); + + for (uint8_t high = 0; high < size; high++) { + uint8_t high8 = (S == 5) ? scale5To8(high) : scale6To8(high); // expand the high endpoint, not low, in the 6-bit case too + uint8_t value, value_half; + + if (use_8bit) { + value = interpolator.Interpolate8(high8, low8); + value_half = interpolator.InterpolateHalf8(high8, low8); + } else { + value = (S == 5) ? 
interpolator.Interpolate5(high, low) : interpolator.Interpolate6(high, low); + value_half = (S == 5) ? interpolator.InterpolateHalf5(high, low) : interpolator.InterpolateHalf6(high, low); + } + + PrepSingleColorTableEntry(error, match_table, value, i, low, high, low8, high8, ideal); + PrepSingleColorTableEntry(error_half, match_table_half, value_half, i, low, high, low8, high8, ideal); + } + } + } +} +// endregion + +BC1Encoder::BC1Encoder(InterpolatorPtr interpolator) : _interpolator(interpolator) { + PrepSingleColorTable<5>(*_single_match5, *_single_match5_half, *_interpolator); + PrepSingleColorTable<6>(*_single_match6, *_single_match6_half, *_interpolator); +} + +void BC1Encoder::EncodeBlock(Color4x4 pixels, BC1Block *dest) const { + auto r_view = pixels.GetChannel(0); + auto g_view = pixels.GetChannel(1); + auto b_view = pixels.GetChannel(2); + + if (pixels.IsSingleColor() || true) { // for now assume (wrongly) everything is a single-color block + // single-color pixel block, do it the fast way + EncodeBlockSingleColor(pixels.Get(0, 0), dest); + return; + } + + Color min, max, avg; + pixels.GetMinMaxAvgRGB(min, max, avg); +} + +void BC1Encoder::EncodeBlockSingleColor(Color color, BC1Block *dest) const { + uint8_t mask = 0xAA; // 2222 + uint16_t min16, max16; + + bool using_3color = false; + + // why is there no subscript operator for shared_ptr + MatchList &match5 = *_single_match5; + MatchList &match6 = *_single_match6; + MatchList &match5_half = *_single_match5_half; + MatchList &match6_half = *_single_match6_half; + + BC1MatchEntry match_r = match5[color.r]; + BC1MatchEntry match_g = match6[color.g]; + BC1MatchEntry match_b = match5[color.b]; + + if ((_flags & (Flags::Use3ColorBlocks | Flags::Use3ColorBlocksForBlackPixels)) != Flags::None) { + BC1MatchEntry match_r_half = match5_half[color.r]; + BC1MatchEntry match_g_half = match6_half[color.g]; + BC1MatchEntry match_b_half = match5_half[color.b]; + + const unsigned err4 = match_r.error + match_g.error + 
match_b.error; + const unsigned err3 = match_r_half.error + match_g_half.error + match_b_half.error; + + if (err3 < err4) { + min16 = Color::Pack565Unscaled(match_r_half.low, match_g_half.low, match_b_half.low); + max16 = Color::Pack565Unscaled(match_r_half.high, match_g_half.high, match_b_half.high); + + if (max16 > min16) std::swap(min16, max16); + using_3color = true; + } + } + + if (!using_3color) { + min16 = Color::Pack565Unscaled(match_r.low, match_g.low, match_b.low); + max16 = Color::Pack565Unscaled(match_r.high, match_g.high, match_b.high); + + if (min16 == max16) { + // make sure this isn't accidentally a 3-color block + // so make max16 > min16 (l > h) + if (min16 > 0) { + min16--; + mask = 0; // endpoints are equal so mask doesn't matter + } else { + assert(min16 == 0 && max16 == 0); + max16 = 1; + min16 = 0; + mask = 0x55; // 1111 (min value only, max is ignored) + } + } else if (max16 < min16) { + std::swap(min16, max16); + mask = 0xFF; // invert mask to 3333 + } + assert(max16 > min16); + } + + dest->SetLowColor(max16); + dest->SetHighColor(min16); + dest->selectors[0] = mask; + dest->selectors[1] = mask; + dest->selectors[2] = mask; + dest->selectors[3] = mask; +} + +} // namespace rgbcx \ No newline at end of file diff --git a/src/BC1/BC1Encoder.h b/src/BC1/BC1Encoder.h new file mode 100644 index 0000000..4b20885 --- /dev/null +++ b/src/BC1/BC1Encoder.h @@ -0,0 +1,145 @@ +/* Python-rgbcx Texture Compression Library + Copyright (C) 2021 Andrew Cassidy + Partially derived from rgbcx.h written by Richard Geldreich + and licenced under the public domain + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#include +#include +#include +#include + +#include "../BlockEncoder.h" +#include "../BlockView.h" +#include "../Interpolator.h" +#include "../bitwiseEnums.h" +#include "../ndebug.h" +#include "../tables.h" +#include "BC1Block.h" + +namespace rgbcx { + +struct BC1MatchEntry { + uint8_t high; + uint8_t low; + uint8_t error; +}; + +class BC1Encoder : public BlockEncoder { + public: + using InterpolatorPtr = std::shared_ptr; + + enum class Flags : uint32_t { + None = 0, + + // Try to improve quality using the most likely total orderings. + // The total_orderings_to_try parameter will then control the number of total orderings to try for 4 color blocks, and the + // total_orderings_to_try3 parameter will control the number of total orderings to try for 3 color blocks (if they are enabled). + UseLikelyTotalOrderings = 2, + + // Use 2 least squares passes, instead of one (same as stb_dxt's HIGHQUAL option). + // Recommended if you're enabling UseLikelyTotalOrderings. + TwoLeastSquaresPasses = 4, + + // Use3ColorBlocksForBlackPixels allows the BC1 encoder to use 3-color blocks for blocks containing black or very dark pixels. + // Your shader/engine MUST ignore the alpha channel on textures encoded with this flag. + // Average quality goes up substantially for my 100 texture corpus (~.5 dB), so it's worth using if you can. + // Note the BC1 encoder does not actually support transparency in 3-color mode. + // Don't set when encoding to BC3. 
+ Use3ColorBlocksForBlackPixels = 8, + + // If Use3ColorBlocks is set, the encoder can use 3-color mode for a small but noticeable gain in average quality, but lower perf. + // If you also specify the UseLikelyTotalOrderings flag, set the total_orderings_to_try3 parameter to the number of total orderings to try. + // Don't set when encoding to BC3. + Use3ColorBlocks = 16, + + // Iterative will greatly increase encode time, but is very slightly higher quality. + // Same as squish's iterative cluster fit option. Not really worth the tiny boost in quality, unless you just don't care about perf. at all. + Iterative = 32, + + // BoundingBox enables a fast all-integer PCA approximation on 4-color blocks. + // At level 0 options (no other flags), this is ~15% faster, and higher *average* quality. + BoundingBox = 64, + + // Use a slightly lower quality, but ~30% faster MSE evaluation function for 4-color blocks. + UseFasterMSEEval = 128, + + // Examine all colors to compute selectors/MSE (slower than default) + UseFullMSEEval = 256, + + // Use 2D least squares+inset+optimal rounding (the method used in Humus's GPU texture encoding demo), instead of PCA. + // Around 18% faster, very slightly lower average quality to better (depends on the content). + Use2DLS = 512, + + // Use 6 power iterations vs. 4 for PCA. + Use6PowerIters = 2048, + + // Check all total orderings - *very* slow. The encoder is not designed to be used in this way. + Exhaustive = 8192, + + // Try 2 different ways of choosing the initial endpoints. + TryAllInitialEndponts = 16384, + + // Same as BoundingBox, but implemented using integer math (faster, slightly less quality) + BoundingBoxInt = 32768, + + // Try refining the final endpoints by examining nearby colors. 
+ EndpointSearchRoundsShift = 22, + EndpointSearchRoundsMask = 1023U << EndpointSearchRoundsShift, + }; + + BC1Encoder(InterpolatorPtr interpolator); + + + void EncodeBlock(Color4x4 pixels, BC1Block *dest) const override; + + private: + const InterpolatorPtr _interpolator; + + Flags _flags = Flags::None; // read by EncodeBlockSingleColor, so it must never be indeterminate + unsigned _search_rounds = 0; + unsigned _orderings4 = 0; + unsigned _orderings3 = 0; + + void EncodeBlockSingleColor(Color color, BC1Block *dest) const; + + // match tables used for single-color blocks + // Each entry includes a high and low pair that best reproduces the 8-bit index as well as possible, + // with an included error value + // these depend on the interpolator + using MatchList = std::array; + using MatchListPtr = std::shared_ptr; + + const MatchListPtr _single_match5 = std::make_shared(); + const MatchListPtr _single_match6 = std::make_shared(); + const MatchListPtr _single_match5_half = std::make_shared(); + const MatchListPtr _single_match6_half = std::make_shared(); + + // static lookup tables, generated the first time an encoder is created + // the mutex prevents race conditions if two encoders are created on different threads + static std::mutex _luts_mutex; + static bool _luts_initialized; + + // lookup table for hash values + static uint16_t g_total_ordering4_hash[4096]; + static uint16_t g_total_ordering3_hash[256]; + + static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3]; + static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3]; +}; +} // namespace rgbcx diff --git a/src/BlockView.h b/src/BlockView.h index 7b97c2a..973660c 100644 --- a/src/BlockView.h +++ b/src/BlockView.h @@ -81,6 +81,9 @@ template class BlockView { start[(row_stride * (int)y) + (pixel_stride * (int)x)] = value; } + constexpr S &Get(unsigned i) noexcept(ndebug) { return Get(i % N, i / N); } + constexpr S Get(unsigned i) const noexcept(ndebug) { return Get(i % N, i / N); } + constexpr 
std::array Flatten() noexcept { std::array result; for (unsigned x = 0; x < N; x++) { @@ 
-108,6 +111,34 @@ template class ColorBlockView : public BlockView sums; + + for (unsigned i = 1; i < M * N; i++) { + auto val = Base::Get(i); + for (unsigned c = 0; c < 3; c++) { + if (val[c] < min[c]) { + min[c] = val[c]; + } else if (val[c] > max[c]) { // only raise max on a genuinely larger value + max[c] = val[c]; + } + sums[c] += val[c]; + } + } + + for (unsigned c = 0; c < 3; c++) { avg[c] = (uint8_t)(sums[c] / (M * N)); } + } }; using Color4x4 = ColorBlockView<4, 4>; diff --git a/src/Interpolator.h b/src/Interpolator.h index 5469659..4dc5d45 100644 --- a/src/Interpolator.h +++ b/src/Interpolator.h @@ -52,6 +52,16 @@ class Interpolator { */ virtual uint8_t Interpolate6(uint8_t v0, uint8_t v1) const; + /** + * Performs a 2/3 interpolation of a pair of 8-bit values to produce an 8-bit value + * Output is approximately (2v0 + v1)/3. + * Output is not guaranteed to be accurate for the given interpolator if CanInterpolate8Bit() is false + * @param v0 The first 8-bit value + * @param v1 The second 8-bit value + * @return The interpolated value + */ + virtual uint8_t Interpolate8(uint8_t v0, uint8_t v1) const; + /** * Performs a 1/2 interpolation of a pair of 5-bit values to produce an 8-bit value * Output is approximately (v0 + v1)/2, with v0 and v1 first extended to 8 bits. @@ -70,6 +80,16 @@ class Interpolator { */ virtual uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const; + /** + * Performs a 1/2 interpolation of a pair of 8-bit values to produce an 8-bit value + * Output is approximately (v0 + v1)/2. 
+ * Output is not guaranteed to be accurate for the given interpolator if CanInterpolate8Bit() is false + * @param v0 The first 8-bit value + * @param v1 The second 8-bit value + * @return The interpolated value + */ + virtual uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const; + /** * Generates the 4 colors for a BC1 block from the given 5:6:5-packed colors * @param low first 5:6:5 color for the block @@ -84,6 +104,8 @@ class Interpolator { */ virtual Type GetType() const noexcept { return Type::Ideal; } + virtual bool CanInterpolate8Bit() const noexcept { return true; } + /** * Checks if the interpolator uses an ideal algorithm * @return true if the interpolator is ideal, false otherwise. @@ -94,21 +116,6 @@ class Interpolator { } private: - virtual uint8_t Interpolate8(uint8_t v0, uint8_t v1) const; - virtual uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const; - - // constexpr static auto Expand5 = ExpandArray(); - // constexpr static auto Expand6 = ExpandArray(); - // - // // match tables used for single-color blocks - // using MatchList = std::array; - // using MatchListPtr = std::shared_ptr; - // - // const MatchListPtr _single_match5 = {std::make_shared()}; - // const MatchListPtr _single_match6 = {std::make_shared()}; - // const MatchListPtr _single_match5_half = {std::make_shared()}; - // const MatchListPtr _single_match6_half = {std::make_shared()}; - Color InterpolateColor24(const Color &c0, const Color &c1) const { return Color(Interpolate8(c0.r, c1.r), Interpolate8(c0.g, c1.g), Interpolate8(c0.b, c1.b)); } @@ -116,33 +123,29 @@ class Interpolator { Color InterpolateHalfColor24(const Color &c0, const Color &c1) const { return Color(InterpolateHalf8(c0.r, c1.r), InterpolateHalf8(c0.g, c1.g), InterpolateHalf8(c0.b, c1.b)); } - - // virtual constexpr bool useExpandedInMatch() noexcept { return true; } - // - // void PrepSingleColorTables(const MatchListPtr &matchTable, const MatchListPtr &matchTableHalf, int len); - // - // int 
PrepSingleColorTableEntry(const MatchListPtr &matchTable, int v, int i, int low, int high, int low_e, int high_e, int lowest_error, bool half, - // bool ideal); }; class InterpolatorRound : public Interpolator { public: uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override; uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override; - Type GetType() const noexcept override { return Type::IdealRound; } - - private: uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override; + + Type GetType() const noexcept override { return Type::IdealRound; } }; class InterpolatorNvidia : public Interpolator { public: uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override; uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override; + uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override; uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override; + std::array InterpolateBC1(uint16_t low, uint16_t high) const override; + Type GetType() const noexcept override { return Type::Nvidia; } + bool CanInterpolate8Bit() const noexcept override { return false; } private: Color InterpolateColor565(const Color &c0, const Color &c1) const { @@ -158,12 +161,12 @@ class InterpolatorAMD : public Interpolator { public: uint8_t Interpolate5(uint8_t v0, uint8_t v1) const override; uint8_t Interpolate6(uint8_t v0, uint8_t v1) const override; + uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override; + uint8_t InterpolateHalf5(uint8_t v0, uint8_t v1) const override; uint8_t InterpolateHalf6(uint8_t v0, uint8_t v1) const override; - Type GetType() const noexcept override { return Type::AMD; } - - private: - uint8_t Interpolate8(uint8_t v0, uint8_t v1) const override; uint8_t InterpolateHalf8(uint8_t v0, uint8_t v1) const override; + + Type GetType() const noexcept override { return Type::AMD; } }; } // namespace rgbcx \ No newline at end of file diff --git a/src/test/test.cpp b/src/test/test.cpp index 126fa74..8e17cce 100644 --- a/src/test/test.cpp +++ 
b/src/test/test.cpp @@ -18,6 +18,7 @@ #include #include "../BC4/BC4Encoder.h" +#include "../BC1/BC1Encoder.h" #include "../rgbcx.h" #include "../rgbcxDecoders.h" #include "../util.h" @@ -671,7 +672,11 @@ int main(int argc, char *argv[]) { for (int i = 0; i < test_count; i++) bc4_encoder.EncodeImage(reinterpret_cast(&packed_image8[0]), src, source_image.width(), source_image.height()); + } else if (dxgi_format == DXGI_FORMAT_BC1_UNORM) { + auto bc1_encoder = BC1Encoder(Interpolator::MakeInterpolator()); + Color *src = &source_image.get_pixels()[0]; + bc1_encoder.EncodeImage(reinterpret_cast(&packed_image8[0]), src, source_image.width(), source_image.height()); } else { for (uint32_t by = 0; by < blocks_y; by++) { for (uint32_t bx = 0; bx < blocks_x; bx++) {