quicktex/src/rgbcx.cpp

// rgbcx.h v1.12
// High-performance scalar BC1-5 encoders. Public Domain or MIT license (you choose - see below), written by Richard Geldreich 2020 <richgel99@gmail.com>.

#pragma GCC diagnostic ignored "-Weverything"
#include "rgbcx.h"

#include <algorithm>
#include <array>
#include <cassert>
#include <climits>
#include <cmath>
#include <cstring>
#include <type_traits>

#include "Color.h"
#include "blocks.h"
#include "tables.h"
#include "util.h"

namespace rgbcx {

static const uint32_t TOTAL_ORDER_4_0_16 = 15;
static const uint32_t TOTAL_ORDER_4_1_16 = 700;
static const uint32_t TOTAL_ORDER_4_2_16 = 753;
static const uint32_t TOTAL_ORDER_4_3_16 = 515;
static uint16_t g_total_ordering4_hash[4096];
static float g_selector_factors4[NUM_UNIQUE_TOTAL_ORDERINGS4][3];

static const uint32_t TOTAL_ORDER_3_0_16 = 12;
static const uint32_t TOTAL_ORDER_3_1_16 = 15;
static const uint32_t TOTAL_ORDER_3_2_16 = 89;
static uint16_t g_total_ordering3_hash[256];
static float g_selector_factors3[NUM_UNIQUE_TOTAL_ORDERINGS3][3];

struct hist4 {
    uint8_t m_hist[4];

    hist4() { memset(m_hist, 0, sizeof(m_hist)); }

    hist4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) {
        m_hist[0] = (uint8_t)i;
        m_hist[1] = (uint8_t)j;
        m_hist[2] = (uint8_t)k;
        m_hist[3] = (uint8_t)l;
    }

    inline bool operator==(const hist4 &h) const {
        if (m_hist[0] != h.m_hist[0]) return false;
        if (m_hist[1] != h.m_hist[1]) return false;
        if (m_hist[2] != h.m_hist[2]) return false;
        if (m_hist[3] != h.m_hist[3]) return false;
        return true;
    }

    inline bool any_16() const { return (m_hist[0] == 16) || (m_hist[1] == 16) || (m_hist[2] == 16) || (m_hist[3] == 16); }

    inline uint32_t lookup_total_ordering_index() const {
        if (m_hist[0] == 16)
            return TOTAL_ORDER_4_0_16;
        else if (m_hist[1] == 16)
            return TOTAL_ORDER_4_1_16;
        else if (m_hist[2] == 16)
            return TOTAL_ORDER_4_2_16;
        else if (m_hist[3] == 16)
            return TOTAL_ORDER_4_3_16;

        // Must sum to 16, so m_hist[3] isn't needed.
        return g_total_ordering4_hash[m_hist[0] | (m_hist[1] << 4) | (m_hist[2] << 8)];
    }
};

struct hist3 {
    uint8_t m_hist[3];

    hist3() { memset(m_hist, 0, sizeof(m_hist)); }

    hist3(uint32_t i, uint32_t j, uint32_t k) {
        m_hist[0] = (uint8_t)i;
        m_hist[1] = (uint8_t)j;
        m_hist[2] = (uint8_t)k;
    }

    inline bool operator==(const hist3 &h) const {
        if (m_hist[0] != h.m_hist[0]) return false;
        if (m_hist[1] != h.m_hist[1]) return false;
        if (m_hist[2] != h.m_hist[2]) return false;
        return true;
    }

    inline bool any_16() const { return (m_hist[0] == 16) || (m_hist[1] == 16) || (m_hist[2] == 16); }

    inline uint32_t lookup_total_ordering_index() const {
        if (m_hist[0] == 16)
            return TOTAL_ORDER_3_0_16;
        else if (m_hist[1] == 16)
            return TOTAL_ORDER_3_1_16;
        else if (m_hist[2] == 16)
            return TOTAL_ORDER_3_2_16;

        // Must sum to 16, so m_hist[2] isn't needed.
        return g_total_ordering3_hash[m_hist[0] | (m_hist[1] << 4)];
    }
};

struct bc1_match_entry {
    uint8_t m_hi;
    uint8_t m_lo;
    uint8_t m_e;
};

static bc1_approx_mode g_bc1_approx_mode;
static bc1_match_entry g_bc1_match5_equals_1[256], g_bc1_match6_equals_1[256];
static bc1_match_entry g_bc1_match5_half[256], g_bc1_match6_half[256];

// v0, v1 = unexpanded DXT1 endpoint values (5/6-bits)
// c0, c1 = expanded DXT1 endpoint values (8-bits)
static inline int interp_5_6_ideal(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    return (c0 * 2 + c1) / 3;
}
static inline int interp_5_6_ideal_round(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    return (c0 * 2 + c1 + 1) / 3;
}
static inline int interp_half_5_6_ideal(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    return (c0 + c1) / 2;
}

static inline int interp_5_nv(int v0, int v1) {
    assert(v0 < 32 && v1 < 32);
    return ((2 * v0 + v1) * 22) / 8;
}
static inline int interp_6_nv(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    const int gdiff = c1 - c0;
    return (256 * c0 + (gdiff / 4) + 128 + gdiff * 80) / 256;
}

static inline int interp_half_5_nv(int v0, int v1) {
    assert(v0 < 32 && v1 < 32);
    return ((v0 + v1) * 33) / 8;
}
static inline int interp_half_6_nv(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    const int gdiff = c1 - c0;
    return (256 * c0 + gdiff / 4 + 128 + gdiff * 128) / 256;
}

static inline int interp_5_6_amd(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    return (c0 * 43 + c1 * 21 + 32) >> 6;
}
static inline int interp_half_5_6_amd(int c0, int c1) {
    assert(c0 < 256 && c1 < 256);
    return (c0 + c1 + 1) >> 1;
}

static inline int interp_5(int v0, int v1, int c0, int c1, bc1_approx_mode mode) {
    // assert(scale_5_to_8(v0) == c0 && scale5To8(v1) == c1);
    switch (mode) {
        case bc1_approx_mode::cBC1NVidia:
            return interp_5_nv(v0, v1);
        case bc1_approx_mode::cBC1AMD:
            return interp_5_6_amd(c0, c1);
        default:
        case bc1_approx_mode::cBC1Ideal:
            return interp_5_6_ideal(c0, c1);
        case bc1_approx_mode::cBC1IdealRound4:
            return interp_5_6_ideal_round(c0, c1);
    }
}

static inline int interp_6(int v0, int v1, int c0, int c1, bc1_approx_mode mode) {
    (void)v0;
    (void)v1;
    // assert(scale_6_to_8(v0) == c0 && scale6To8(v1) == c1);
    switch (mode) {
        case bc1_approx_mode::cBC1NVidia:
            return interp_6_nv(c0, c1);
        case bc1_approx_mode::cBC1AMD:
            return interp_5_6_amd(c0, c1);
        default:
        case bc1_approx_mode::cBC1Ideal:
            return interp_5_6_ideal(c0, c1);
        case bc1_approx_mode::cBC1IdealRound4:
            return interp_5_6_ideal_round(c0, c1);
    }
}

static inline unsigned int interp_half_5(unsigned int v0, unsigned int v1, unsigned int c0, unsigned int c1, bc1_approx_mode mode) {
    assert(scale5To8(v0) == c0 && scale5To8(v1) == c1);
    switch (mode) {
        case bc1_approx_mode::cBC1NVidia:
            return interp_half_5_nv(v0, v1);
        case bc1_approx_mode::cBC1AMD:
            return interp_half_5_6_amd(c0, c1);
        case bc1_approx_mode::cBC1Ideal:
        case bc1_approx_mode::cBC1IdealRound4:
        default:
            return interp_half_5_6_ideal(c0, c1);
    }
}

static inline unsigned int interp_half_6(unsigned v0, unsigned v1, unsigned c0, bc1_approx_mode mode, unsigned c1) {
    (void)v0;
    (void)v1;
    assert(scale6To8(v0) == c0 && scale6To8(v1) == c1);
    switch (mode) {
        case bc1_approx_mode::cBC1NVidia:
            return interp_half_6_nv(c0, c1);
        case bc1_approx_mode::cBC1AMD:
            return interp_half_5_6_amd(c0, c1);
        case bc1_approx_mode::cBC1Ideal:
        case bc1_approx_mode::cBC1IdealRound4:
        default:
            return interp_half_5_6_ideal(c0, c1);
    }
}

static void prepare_bc1_single_color_table_half(bc1_match_entry *pTable, const uint8_t *pExpand, int size, bc1_approx_mode mode) {
    for (int i = 0; i < 256; i++) {
        int lowest_e = 256;
        for (int lo = 0; lo < size; lo++) {
            const int lo_e = pExpand[lo];

            for (int hi = 0; hi < size; hi++) {
                const int hi_e = pExpand[hi];

                const int v = (size == 32) ? interp_half_5(hi, lo, hi_e, lo_e, mode) : interp_half_6(hi, lo, hi_e, mode, lo_e);

                int e = iabs(v - i);

                // We only need to factor in 3% error in BC1 ideal mode.
                if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) e += (iabs(hi_e - lo_e) * 3) / 100;

                // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation.
                if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) {
                    pTable[i].m_hi = static_cast<uint8_t>(hi);
                    pTable[i].m_lo = static_cast<uint8_t>(lo);

                    assert(e <= UINT8_MAX);
                    pTable[i].m_e = static_cast<uint8_t>(e);

                    lowest_e = e;
                }

            }  // hi
        }      // lo
    }
}

static void prepare_bc1_single_color_table(bc1_match_entry *pTable, const uint8_t *pExpand, int size, bc1_approx_mode mode) {
    for (int i = 0; i < 256; i++) {
        int lowest_e = 256;
        for (int lo = 0; lo < size; lo++) {
            const int lo_e = pExpand[lo];

            for (int hi = 0; hi < size; hi++) {
                const int hi_e = pExpand[hi];

                const int v = (size == 32) ? interp_5(hi, lo, hi_e, lo_e, mode) : interp_6(hi, lo, hi_e, lo_e, mode);

                int e = iabs(v - i);

                if ((mode == bc1_approx_mode::cBC1Ideal) || (mode == bc1_approx_mode::cBC1IdealRound4)) e += (iabs(hi_e - lo_e) * 3) / 100;

                // Favor equal endpoints, for lower error on actual GPU's which approximate the interpolation.
                if ((e < lowest_e) || ((e == lowest_e) && (lo == hi))) {
                    pTable[i].m_hi = static_cast<uint8_t>(hi);
                    pTable[i].m_lo = static_cast<uint8_t>(lo);

                    assert(e <= UINT8_MAX);
                    pTable[i].m_e = static_cast<uint8_t>(e);

                    lowest_e = e;
                }

            }  // hi
        }      // lo
    }
}

// This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w))
// where w is [0,1/3,2/3,1]. 9 is the perfect multiplier.
static const uint32_t g_weight_vals4[4] = {0x000009, 0x010204, 0x040201, 0x090000};

// multiplier is 4 for 3-color
static const uint32_t g_weight_vals3[3] = {0x000004, 0x040000, 0x010101};

static inline void compute_selector_factors4(const hist4 &h, float &iz00, float &iz10, float &iz11) {
    uint32_t weight_accum = 0;
    for (uint32_t sel = 0; sel < 4; sel++) weight_accum += g_weight_vals4[sel] * h.m_hist[sel];

    float z00 = (float)((weight_accum >> 16) & 0xFF);
    float z10 = (float)((weight_accum >> 8) & 0xFF);
    float z11 = (float)(weight_accum & 0xFF);
    float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (fabs(det) < 1e-8f)
        det = 0.0f;
    else
        det = (3.0f / 255.0f) / det;

    iz00 = z11 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;
}

static inline void compute_selector_factors3(const hist3 &h, float &iz00, float &iz10, float &iz11) {
    uint32_t weight_accum = 0;
    for (uint32_t sel = 0; sel < 3; sel++) weight_accum += g_weight_vals3[sel] * h.m_hist[sel];

    float z00 = (float)((weight_accum >> 16) & 0xFF);
    float z10 = (float)((weight_accum >> 8) & 0xFF);
    float z11 = (float)(weight_accum & 0xFF);
    float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (fabs(det) < 1e-8f)
        det = 0.0f;
    else
        det = (2.0f / 255.0f) / det;

    iz00 = z11 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;
}

static bool g_initialized;

void init(bc1_approx_mode mode) {
    g_bc1_approx_mode = mode;

    uint8_t bc1_expand5[32];
    for (int i = 0; i < 32; i++) bc1_expand5[i] = static_cast<uint8_t>((i << 3) | (i >> 2));
    prepare_bc1_single_color_table(g_bc1_match5_equals_1, bc1_expand5, 32, mode);
    prepare_bc1_single_color_table_half(g_bc1_match5_half, bc1_expand5, 32, mode);

    uint8_t bc1_expand6[64];
    for (int i = 0; i < 64; i++) bc1_expand6[i] = static_cast<uint8_t>((i << 2) | (i >> 4));
    prepare_bc1_single_color_table(g_bc1_match6_equals_1, bc1_expand6, 64, mode);
    prepare_bc1_single_color_table_half(g_bc1_match6_half, bc1_expand6, 64, mode);

    for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS4; i++) {
        hist4 h;
        h.m_hist[0] = (uint8_t)g_unique_total_orders4[i][0];
        h.m_hist[1] = (uint8_t)g_unique_total_orders4[i][1];
        h.m_hist[2] = (uint8_t)g_unique_total_orders4[i][2];
        h.m_hist[3] = (uint8_t)g_unique_total_orders4[i][3];

        if (!h.any_16()) {
            const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4) | (h.m_hist[2] << 8);
            assert(index < 4096);
            g_total_ordering4_hash[index] = (uint16_t)i;
        }

        compute_selector_factors4(h, g_selector_factors4[i][0], g_selector_factors4[i][1], g_selector_factors4[i][2]);
    }

    for (uint32_t i = 0; i < NUM_UNIQUE_TOTAL_ORDERINGS3; i++) {
        hist3 h;
        h.m_hist[0] = (uint8_t)g_unique_total_orders3[i][0];
        h.m_hist[1] = (uint8_t)g_unique_total_orders3[i][1];
        h.m_hist[2] = (uint8_t)g_unique_total_orders3[i][2];

        if (!h.any_16()) {
            const uint32_t index = h.m_hist[0] | (h.m_hist[1] << 4);
            assert(index < 256);
            g_total_ordering3_hash[index] = (uint16_t)i;
        }

        compute_selector_factors3(h, g_selector_factors3[i][0], g_selector_factors3[i][1], g_selector_factors3[i][2]);
    }

    g_initialized = true;
}

void encode_bc1_solid_block(void *pDst, uint32_t fr, uint32_t fg, uint32_t fb, bool allow_3color) {
    BC1Block *pDst_block = static_cast<BC1Block *>(pDst);

    uint32_t mask = 0xAA;
    int max16 = -1, min16 = 0;

    if (allow_3color) {
        const uint32_t err4 = g_bc1_match5_equals_1[fr].m_e + g_bc1_match6_equals_1[fg].m_e + g_bc1_match5_equals_1[fb].m_e;
        const uint32_t err3 = g_bc1_match5_half[fr].m_e + g_bc1_match6_half[fg].m_e + g_bc1_match5_half[fb].m_e;

        if (err3 < err4) {
            max16 = (g_bc1_match5_half[fr].m_hi << 11) | (g_bc1_match6_half[fg].m_hi << 5) | g_bc1_match5_half[fb].m_hi;
            min16 = (g_bc1_match5_half[fr].m_lo << 11) | (g_bc1_match6_half[fg].m_lo << 5) | g_bc1_match5_half[fb].m_lo;

            if (max16 > min16) std::swap(max16, min16);
        }
    }

    if (max16 == -1) {
        max16 = (g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi;
        min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo;

        if (min16 == max16) {
            // Always forbid 3 color blocks
            // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's.
            mask = 0;

            // Make l > h
            if (min16 > 0)
                min16--;
            else {
                // l = h = 0
                assert(min16 == max16 && max16 == 0);

                max16 = 1;
                min16 = 0;
                mask = 0x55;
            }

            assert(max16 > min16);
        }

        if (max16 < min16) {
            std::swap(max16, min16);
            mask ^= 0x55;
        }
    }

    pDst_block->SetLowColor(static_cast<uint16_t>(max16));
    pDst_block->SetHighColor(static_cast<uint16_t>(min16));
    pDst_block->selectors[0] = static_cast<uint8_t>(mask);
    pDst_block->selectors[1] = static_cast<uint8_t>(mask);
    pDst_block->selectors[2] = static_cast<uint8_t>(mask);
    pDst_block->selectors[3] = static_cast<uint8_t>(mask);
}

static const float g_midpoint5[32] = {.015686f, .047059f, .078431f, .111765f, .145098f, .176471f, .207843f, .241176f, .274510f, .305882f, .337255f,
                                      .370588f, .403922f, .435294f, .466667f, .5f,      .533333f, .564706f, .596078f, .629412f, .662745f, .694118f,
                                      .725490f, .758824f, .792157f, .823529f, .854902f, .888235f, .921569f, .952941f, .984314f, 1e+37f};
static const float g_midpoint6[64] = {.007843f, .023529f, .039216f, .054902f, .070588f, .086275f, .101961f, .117647f, .133333f, .149020f, .164706f,
                                      .180392f, .196078f, .211765f, .227451f, .245098f, .262745f, .278431f, .294118f, .309804f, .325490f, .341176f,
                                      .356863f, .372549f, .388235f, .403922f, .419608f, .435294f, .450980f, .466667f, .482353f, .500000f, .517647f,
                                      .533333f, .549020f, .564706f, .580392f, .596078f, .611765f, .627451f, .643137f, .658824f, .674510f, .690196f,
                                      .705882f, .721569f, .737255f, .754902f, .772549f, .788235f, .803922f, .819608f, .835294f, .850980f, .866667f,
                                      .882353f, .898039f, .913725f, .929412f, .945098f, .960784f, .976471f, .992157f, 1e+37f};

struct vec3F {
    float c[3];
};

static inline void compute_least_squares_endpoints4_rgb(vec3F *pXl, vec3F *pXh, int total_r, int total_g, int total_b, float iz00, float iz10, float iz11,
                                                        uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) {
    const float iz01 = iz10;

    const uint32_t f1 = g_unique_total_orders4[s][0];
    const uint32_t f2 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1];
    const uint32_t f3 = g_unique_total_orders4[s][0] + g_unique_total_orders4[s][1] + g_unique_total_orders4[s][2];
    uint32_t uq00_r = (r_sum[f2] - r_sum[f1]) + (r_sum[f3] - r_sum[f2]) * 2 + (r_sum[16] - r_sum[f3]) * 3;
    uint32_t uq00_g = (g_sum[f2] - g_sum[f1]) + (g_sum[f3] - g_sum[f2]) * 2 + (g_sum[16] - g_sum[f3]) * 3;
    uint32_t uq00_b = (b_sum[f2] - b_sum[f1]) + (b_sum[f3] - b_sum[f2]) * 2 + (b_sum[16] - b_sum[f3]) * 3;

    float q10_r = (float)(total_r * 3 - uq00_r);
    float q10_g = (float)(total_g * 3 - uq00_g);
    float q10_b = (float)(total_b * 3 - uq00_b);

    pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r;
    pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r;

    pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g;
    pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g;

    pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b;
    pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b;
}

static inline bool compute_least_squares_endpoints4_rgb(const Color *pColors, const uint8_t *pSelectors, vec3F *pXl, vec3F *pXh, int total_r, int total_g,
                                                        int total_b) {
    uint32_t uq00_r = 0, uq00_g = 0, uq00_b = 0;
    uint32_t weight_accum = 0;
    for (uint32_t i = 0; i < 16; i++) {
        const uint8_t r = pColors[i][0], g = pColors[i][1], b = pColors[i][2];
        const uint8_t sel = pSelectors[i];

        weight_accum += g_weight_vals4[sel];
        uq00_r += sel * r;
        uq00_g += sel * g;
        uq00_b += sel * b;
    }

    int q10_r = total_r * 3 - uq00_r;
    int q10_g = total_g * 3 - uq00_g;
    int q10_b = total_b * 3 - uq00_b;

    float z00 = (float)((weight_accum >> 16) & 0xFF);
    float z10 = (float)((weight_accum >> 8) & 0xFF);
    float z11 = (float)(weight_accum & 0xFF);
    float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (fabs(det) < 1e-8f) return false;

    det = (3.0f / 255.0f) / det;

    float iz00, iz01, iz10, iz11;
    iz00 = z11 * det;
    iz01 = -z01 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;

    pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r;
    pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r;

    pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g;
    pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g;

    pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b;
    pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b;

    return true;
}

static inline void compute_least_squares_endpoints3_rgb(vec3F *pXl, vec3F *pXh, int total_r, int total_g, int total_b, float iz00, float iz10, float iz11,
                                                        uint32_t s, const uint32_t r_sum[17], const uint32_t g_sum[17], const uint32_t b_sum[17]) {
    const float iz01 = iz10;

    // Compensates for BC1 3-color ordering, which is selector 0, 2, 1
    const uint32_t f1 = g_unique_total_orders3[s][0];
    const uint32_t f2 = g_unique_total_orders3[s][0] + g_unique_total_orders3[s][2];
    uint32_t uq00_r = (r_sum[16] - r_sum[f2]) * 2 + (r_sum[f2] - r_sum[f1]);
    uint32_t uq00_g = (g_sum[16] - g_sum[f2]) * 2 + (g_sum[f2] - g_sum[f1]);
    uint32_t uq00_b = (b_sum[16] - b_sum[f2]) * 2 + (b_sum[f2] - b_sum[f1]);

    float q10_r = (float)(total_r * 2 - uq00_r);
    float q10_g = (float)(total_g * 2 - uq00_g);
    float q10_b = (float)(total_b * 2 - uq00_b);

    pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r;
    pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r;

    pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g;
    pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g;

    pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b;
    pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b;
}

static inline bool compute_least_squares_endpoints3_rgb(bool use_black, const Color *pColors, const uint8_t *pSelectors, vec3F *pXl, vec3F *pXh) {
    int uq00_r = 0, uq00_g = 0, uq00_b = 0;
    uint32_t weight_accum = 0;
    int total_r = 0, total_g = 0, total_b = 0;
    for (uint32_t i = 0; i < 16; i++) {
        const uint8_t r = pColors[i][0], g = pColors[i][1], b = pColors[i][2];
        if (use_black) {
            if ((r | g | b) < 4) continue;
        }

        const uint8_t sel = pSelectors[i];
        assert(sel <= 3);
        if (sel == 3) continue;

        weight_accum += g_weight_vals3[sel];

        static const uint8_t s_tran[3] = {0, 2, 1};
        const uint8_t tsel = s_tran[sel];
        uq00_r += tsel * r;
        uq00_g += tsel * g;
        uq00_b += tsel * b;

        total_r += r;
        total_g += g;
        total_b += b;
    }

    int q10_r = total_r * 2 - uq00_r;
    int q10_g = total_g * 2 - uq00_g;
    int q10_b = total_b * 2 - uq00_b;

    float z00 = (float)((weight_accum >> 16) & 0xFF);
    float z10 = (float)((weight_accum >> 8) & 0xFF);
    float z11 = (float)(weight_accum & 0xFF);
    float z01 = z10;

    float det = z00 * z11 - z01 * z10;
    if (fabs(det) < 1e-8f) return false;

    det = (2.0f / 255.0f) / det;

    float iz00, iz01, iz10, iz11;
    iz00 = z11 * det;
    iz01 = -z01 * det;
    iz10 = -z10 * det;
    iz11 = z00 * det;

    pXl->c[0] = iz00 * (float)uq00_r + iz01 * q10_r;
    pXh->c[0] = iz10 * (float)uq00_r + iz11 * q10_r;

    pXl->c[1] = iz00 * (float)uq00_g + iz01 * q10_g;
    pXh->c[1] = iz10 * (float)uq00_g + iz11 * q10_g;

    pXl->c[2] = iz00 * (float)uq00_b + iz01 * q10_b;
    pXh->c[2] = iz10 * (float)uq00_b + iz11 * q10_b;

    return true;
}

static inline void bc1_get_block_colors4(uint32_t block_r[4], uint32_t block_g[4], uint32_t block_b[4], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr,
                                         uint32_t hg, uint32_t hb) {
    block_r[0] = (lr << 3) | (lr >> 2);
    block_g[0] = (lg << 2) | (lg >> 4);
    block_b[0] = (lb << 3) | (lb >> 2);
    block_r[3] = (hr << 3) | (hr >> 2);
    block_g[3] = (hg << 2) | (hg >> 4);
    block_b[3] = (hb << 3) | (hb >> 2);

    if (g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) {
        block_r[1] = (block_r[0] * 2 + block_r[3]) / 3;
        block_g[1] = (block_g[0] * 2 + block_g[3]) / 3;
        block_b[1] = (block_b[0] * 2 + block_b[3]) / 3;
        block_r[2] = (block_r[3] * 2 + block_r[0]) / 3;
        block_g[2] = (block_g[3] * 2 + block_g[0]) / 3;
        block_b[2] = (block_b[3] * 2 + block_b[0]) / 3;
    } else if (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4) {
        block_r[1] = (block_r[0] * 2 + block_r[3] + 1) / 3;
        block_g[1] = (block_g[0] * 2 + block_g[3] + 1) / 3;
        block_b[1] = (block_b[0] * 2 + block_b[3] + 1) / 3;
        block_r[2] = (block_r[3] * 2 + block_r[0] + 1) / 3;
        block_g[2] = (block_g[3] * 2 + block_g[0] + 1) / 3;
        block_b[2] = (block_b[3] * 2 + block_b[0] + 1) / 3;
    } else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) {
        block_r[1] = interp_5_6_amd(block_r[0], block_r[3]);
        block_g[1] = interp_5_6_amd(block_g[0], block_g[3]);
        block_b[1] = interp_5_6_amd(block_b[0], block_b[3]);
        block_r[2] = interp_5_6_amd(block_r[3], block_r[0]);
        block_g[2] = interp_5_6_amd(block_g[3], block_g[0]);
        block_b[2] = interp_5_6_amd(block_b[3], block_b[0]);
    } else {
        block_r[1] = interp_5_nv(lr, hr);
        block_g[1] = interp_6_nv(block_g[0], block_g[3]);
        block_b[1] = interp_5_nv(lb, hb);
        block_r[2] = interp_5_nv(hr, lr);
        block_g[2] = interp_6_nv(block_g[3], block_g[0]);
        block_b[2] = interp_5_nv(hb, lb);
    }
}

static inline void bc1_get_block_colors3(uint32_t block_r[3], uint32_t block_g[3], uint32_t block_b[3], uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr,
                                         uint32_t hg, uint32_t hb) {
    block_r[0] = (lr << 3) | (lr >> 2);
    block_g[0] = (lg << 2) | (lg >> 4);
    block_b[0] = (lb << 3) | (lb >> 2);
    block_r[1] = (hr << 3) | (hr >> 2);
    block_g[1] = (hg << 2) | (hg >> 4);
    block_b[1] = (hb << 3) | (hb >> 2);

    if ((g_bc1_approx_mode == bc1_approx_mode::cBC1Ideal) || (g_bc1_approx_mode == bc1_approx_mode::cBC1IdealRound4)) {
        block_r[2] = (block_r[0] + block_r[1]) / 2;
        block_g[2] = (block_g[0] + block_g[1]) / 2;
        block_b[2] = (block_b[0] + block_b[1]) / 2;
    } else if (g_bc1_approx_mode == bc1_approx_mode::cBC1AMD) {
        block_r[2] = interp_half_5_6_amd(block_r[0], block_r[1]);
        block_g[2] = interp_half_5_6_amd(block_g[0], block_g[1]);
        block_b[2] = interp_half_5_6_amd(block_b[0], block_b[1]);
    } else {
        block_r[2] = interp_half_5_nv(lr, hr);
        block_g[2] = interp_half_6_nv(block_g[0], block_g[1]);
        block_b[2] = interp_half_5_nv(lb, hb);
    }
}

static inline void bc1_find_sels4_noerr(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
                                        uint8_t sels[16]) {
    uint32_t block_r[4], block_g[4], block_b[4];
    bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

    int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];

    int dots[4];
    for (uint32_t i = 0; i < 4; i++) dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab;

    int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];

    ar *= 2;
    ag *= 2;
    ab *= 2;

    static const uint8_t s_sels[4] = {3, 2, 1, 0};

    for (uint32_t i = 0; i < 16; i += 4) {
        const int d0 = pSrc_pixels[i + 0].r * ar + pSrc_pixels[i + 0].g * ag + pSrc_pixels[i + 0].b * ab;
        const int d1 = pSrc_pixels[i + 1].r * ar + pSrc_pixels[i + 1].g * ag + pSrc_pixels[i + 1].b * ab;
        const int d2 = pSrc_pixels[i + 2].r * ar + pSrc_pixels[i + 2].g * ag + pSrc_pixels[i + 2].b * ab;
        const int d3 = pSrc_pixels[i + 3].r * ar + pSrc_pixels[i + 3].g * ag + pSrc_pixels[i + 3].b * ab;

        sels[i + 0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)];
        sels[i + 1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)];
        sels[i + 2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)];
        sels[i + 3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)];
    }
}

static inline uint32_t bc1_find_sels4_fasterr(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
                                              uint8_t sels[16], uint32_t cur_err) {
    uint32_t block_r[4], block_g[4], block_b[4];
    bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

    int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];

    int dots[4];
    for (uint32_t i = 0; i < 4; i++) dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab;

    int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];

    ar *= 2;
    ag *= 2;
    ab *= 2;

    static const uint8_t s_sels[4] = {3, 2, 1, 0};

    uint32_t total_err = 0;

    for (uint32_t i = 0; i < 16; i += 4) {
        const int d0 = pSrc_pixels[i + 0].r * ar + pSrc_pixels[i + 0].g * ag + pSrc_pixels[i + 0].b * ab;
        const int d1 = pSrc_pixels[i + 1].r * ar + pSrc_pixels[i + 1].g * ag + pSrc_pixels[i + 1].b * ab;
        const int d2 = pSrc_pixels[i + 2].r * ar + pSrc_pixels[i + 2].g * ag + pSrc_pixels[i + 2].b * ab;
        const int d3 = pSrc_pixels[i + 3].r * ar + pSrc_pixels[i + 3].g * ag + pSrc_pixels[i + 3].b * ab;

        uint8_t sel0 = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)];
        uint8_t sel1 = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)];
        uint8_t sel2 = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)];
        uint8_t sel3 = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)];

        sels[i + 0] = sel0;
        sels[i + 1] = sel1;
        sels[i + 2] = sel2;
        sels[i + 3] = sel3;

        total_err +=
            squarei(pSrc_pixels[i + 0].r - block_r[sel0]) + squarei(pSrc_pixels[i + 0].g - block_g[sel0]) + squarei(pSrc_pixels[i + 0].b - block_b[sel0]);
        total_err +=
            squarei(pSrc_pixels[i + 1].r - block_r[sel1]) + squarei(pSrc_pixels[i + 1].g - block_g[sel1]) + squarei(pSrc_pixels[i + 1].b - block_b[sel1]);
        total_err +=
            squarei(pSrc_pixels[i + 2].r - block_r[sel2]) + squarei(pSrc_pixels[i + 2].g - block_g[sel2]) + squarei(pSrc_pixels[i + 2].b - block_b[sel2]);
        total_err +=
            squarei(pSrc_pixels[i + 3].r - block_r[sel3]) + squarei(pSrc_pixels[i + 3].g - block_g[sel3]) + squarei(pSrc_pixels[i + 3].b - block_b[sel3]);

        if (total_err >= cur_err) break;
    }

    return total_err;
}

static inline uint32_t bc1_find_sels4_check2_err(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
                                                 uint8_t sels[16], uint32_t cur_err) {
    uint32_t block_r[4], block_g[4], block_b[4];
    bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

    int dr = block_r[3] - block_r[0], dg = block_g[3] - block_g[0], db = block_b[3] - block_b[0];

    const float f = 4.0f / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f);

    uint32_t total_err = 0;

    for (uint32_t i = 0; i < 16; i++) {
        const int r = pSrc_pixels[i].r;
        const int g = pSrc_pixels[i].g;
        const int b = pSrc_pixels[i].b;

        int sel = (int)((float)((r - (int)block_r[0]) * dr + (g - (int)block_g[0]) * dg + (b - (int)block_b[0]) * db) * f + .5f);
        sel = clampi(sel, 1, 3);

        uint32_t err0 = squarei((int)block_r[sel - 1] - (int)r) + squarei((int)block_g[sel - 1] - (int)g) + squarei((int)block_b[sel - 1] - (int)b);
        uint32_t err1 = squarei((int)block_r[sel] - (int)r) + squarei((int)block_g[sel] - (int)g) + squarei((int)block_b[sel] - (int)b);

        int best_sel = sel;
        uint32_t best_err = err1;
        if (err0 == err1) {
            // Prefer non-interpolation
            if ((best_sel - 1) == 0) best_sel = 0;
        } else if (err0 < best_err) {
            best_sel = sel - 1;
            best_err = err0;
        }

        total_err += best_err;

        if (total_err >= cur_err) break;

        sels[i] = (uint8_t)best_sel;
    }
    return total_err;
}

static inline uint32_t bc1_find_sels4_fullerr(const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
                                              uint8_t sels[16], uint32_t cur_err) {
    uint32_t block_r[4], block_g[4], block_b[4];
    bc1_get_block_colors4(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

    uint32_t total_err = 0;

    for (uint32_t i = 0; i < 16; i++) {
        const int r = pSrc_pixels[i].r;
        const int g = pSrc_pixels[i].g;
        const int b = pSrc_pixels[i].b;

        uint32_t best_err = squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b);
        uint8_t best_sel = 0;

        for (uint32_t j = 1; (j < 4) && best_err; j++) {
            uint32_t err = squarei((int)block_r[j] - (int)r) + squarei((int)block_g[j] - (int)g) + squarei((int)block_b[j] - (int)b);
            if ((err < best_err) || ((err == best_err) && (j == 3))) {
                best_err = err;
                best_sel = (uint8_t)j;
            }
        }

        total_err += best_err;

        if (total_err >= cur_err) break;

        sels[i] = (uint8_t)best_sel;
    }
    return total_err;
}

static inline uint32_t bc1_find_sels4(uint32_t flags, const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb,
                                      uint8_t sels[16], uint32_t cur_err) {
    uint32_t err;

    if (flags & cEncodeBC1UseFasterMSEEval)
        err = bc1_find_sels4_fasterr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err);
    else if (flags & cEncodeBC1UseFullMSEEval)
        err = bc1_find_sels4_fullerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err);
    else
        err = bc1_find_sels4_check2_err(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels, cur_err);

    return err;
}

static inline uint32_t bc1_find_sels3_fullerr(bool use_black, const Color *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg,
                                              uint32_t hb, uint8_t sels[16], uint32_t cur_err) {
    uint32_t block_r[3], block_g[3], block_b[3];
    bc1_get_block_colors3(block_r, block_g, block_b, lr, lg, lb, hr, hg, hb);

    uint32_t total_err = 0;

    for (uint32_t i = 0; i < 16; i++) {
        const int r = pSrc_pixels[i].r;
        const int g = pSrc_pixels[i].g;
        const int b = pSrc_pixels[i].b;

        uint32_t best_err = squarei((int)block_r[0] - (int)r) + squarei((int)block_g[0] - (int)g) + squarei((int)block_b[0] - (int)b);
        uint32_t best_sel = 0;

        uint32_t err1 = squarei((int)block_r[1] - (int)r) + squarei((int)block_g[1] - (int)g) + squarei((int)block_b[1] - (int)b);
        if (err1 < best_err) {
            best_err = err1;
            best_sel = 1;
        }

        uint32_t err2 = squarei((int)block_r[2] - (int)r) + squarei((int)block_g[2] - (int)g) + squarei((int)block_b[2] - (int)b);
        if (err2 < best_err) {
            best_err = err2;
            best_sel = 2;
        }

        if (use_black) {
            uint32_t err3 = squarei(r) + squarei(g) + squarei(b);
            if (err3 < best_err) {
                best_err = err3;
                best_sel = 3;
            }
        }

        total_err += best_err;
        if (total_err >= cur_err) return total_err;

        sels[i] = (uint8_t)best_sel;
    }

    return total_err;
}

static inline void precise_round_565(const vec3F &xl, const vec3F &xh, int &trial_lr, int &trial_lg, int &trial_lb, int &trial_hr, int &trial_hg,
                                     int &trial_hb) {
    trial_lr = (int)(xl.c[0] * 31.0f);
    trial_lg = (int)(xl.c[1] * 63.0f);
    trial_lb = (int)(xl.c[2] * 31.0f);

    trial_hr = (int)(xh.c[0] * 31.0f);
    trial_hg = (int)(xh.c[1] * 63.0f);
    trial_hb = (int)(xh.c[2] * 31.0f);

    if ((uint32_t)(trial_lr | trial_lb | trial_hr | trial_hb) > 31U) {
        trial_lr = ((uint32_t)trial_lr > 31U) ? (~trial_lr >> 31) & 31 : trial_lr;
        trial_hr = ((uint32_t)trial_hr > 31U) ? (~trial_hr >> 31) & 31 : trial_hr;

        trial_lb = ((uint32_t)trial_lb > 31U) ? (~trial_lb >> 31) & 31 : trial_lb;
        trial_hb = ((uint32_t)trial_hb > 31U) ? (~trial_hb >> 31) & 31 : trial_hb;
    }

    if ((uint32_t)(trial_lg | trial_hg) > 63U) {
        trial_lg = ((uint32_t)trial_lg > 63U) ? (~trial_lg >> 31) & 63 : trial_lg;
        trial_hg = ((uint32_t)trial_hg > 63U) ? (~trial_hg >> 31) & 63 : trial_hg;
    }

    trial_lr = (trial_lr + (xl.c[0] > g_midpoint5[trial_lr])) & 31;
    trial_lg = (trial_lg + (xl.c[1] > g_midpoint6[trial_lg])) & 63;
    trial_lb = (trial_lb + (xl.c[2] > g_midpoint5[trial_lb])) & 31;

    trial_hr = (trial_hr + (xh.c[0] > g_midpoint5[trial_hr])) & 31;
    trial_hg = (trial_hg + (xh.c[1] > g_midpoint6[trial_hg])) & 63;
    trial_hb = (trial_hb + (xh.c[2] > g_midpoint5[trial_hb])) & 31;
}

static inline void precise_round_565_noscale(vec3F xl, vec3F xh, int &trial_lr, int &trial_lg, int &trial_lb, int &trial_hr, int &trial_hg, int &trial_hb) {
    xl.c[0] *= 1.0f / 255.0f;
    xl.c[1] *= 1.0f / 255.0f;
    xl.c[2] *= 1.0f / 255.0f;

    xh.c[0] *= 1.0f / 255.0f;
    xh.c[1] *= 1.0f / 255.0f;
    xh.c[2] *= 1.0f / 255.0f;

    precise_round_565(xl, xh, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb);
}

static inline void bc1_encode4(BC1Block *pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) {
    uint16_t lc16 = Color::Pack565Unscaled(lr, lg, lb);
    uint16_t hc16 = Color::Pack565Unscaled(hr, hg, hb);

    // Always forbid 3 color blocks
    if (lc16 == hc16) {
        uint8_t mask = 0;

        // Make l > h
        if (hc16 > 0)
            hc16--;
        else {
            // lc16 = hc16 = 0
            assert(lc16 == hc16 && hc16 == 0);

            hc16 = 0;
            lc16 = 1;
            mask = 0x55;  // select hc16
        }

        assert(lc16 > hc16);
        pDst_block->SetLowColor(static_cast<uint16_t>(lc16));
        pDst_block->SetHighColor(static_cast<uint16_t>(hc16));

        pDst_block->selectors[0] = mask;
        pDst_block->selectors[1] = mask;
        pDst_block->selectors[2] = mask;
        pDst_block->selectors[3] = mask;
    } else {
        uint8_t invert_mask = 0;
        if (lc16 < hc16) {
            std::swap(lc16, hc16);
            invert_mask = 0x55;
        }

        assert(lc16 > hc16);
        pDst_block->SetLowColor((uint16_t)lc16);
        pDst_block->SetHighColor((uint16_t)hc16);

        uint32_t packed_sels = 0;
        static const uint8_t s_sel_trans[4] = {0, 2, 3, 1};
        for (uint32_t i = 0; i < 16; i++) packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2));

        // todo: make this less silly to prevent packing and unpacking
        pDst_block->selectors[0] = (uint8_t)packed_sels ^ invert_mask;
        pDst_block->selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask;
        pDst_block->selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask;
        pDst_block->selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask;
    }
}

static inline void bc1_encode3(BC1Block *pDst_block, int lr, int lg, int lb, int hr, int hg, int hb, const uint8_t sels[16]) {
    uint16_t lc16 = Color::Pack565Unscaled(lr, lg, lb);
    uint16_t hc16 = Color::Pack565Unscaled(hr, hg, hb);

    bool invert_flag = false;
    if (lc16 > hc16) {
        std::swap(lc16, hc16);
        invert_flag = true;
    }

    assert(lc16 <= hc16);

    pDst_block->SetLowColor((uint16_t)lc16);
    pDst_block->SetHighColor((uint16_t)hc16);

    uint32_t packed_sels = 0;

    if (invert_flag) {
        static const uint8_t s_sel_trans_inv[4] = {1, 0, 2, 3};

        for (uint32_t i = 0; i < 16; i++) packed_sels |= ((uint32_t)s_sel_trans_inv[sels[i]] << (i * 2));
    } else {
        for (uint32_t i = 0; i < 16; i++) packed_sels |= ((uint32_t)sels[i] << (i * 2));
    }

    // todo: make this less silly to prevent packing and unpacking
    pDst_block->selectors[0] = (uint8_t)packed_sels;
    pDst_block->selectors[1] = (uint8_t)(packed_sels >> 8);
    pDst_block->selectors[2] = (uint8_t)(packed_sels >> 16);
    pDst_block->selectors[3] = (uint8_t)(packed_sels >> 24);
}

struct bc1_encode_results {
    int lr, lg, lb;
    int hr, hg, hb;
    uint8_t sels[16];
    bool m_3color;
};

static bool try_3color_block_useblack(const Color *pSrc_pixels, uint32_t flags, uint32_t &cur_err, bc1_encode_results &results) {
    int total_r = 0, total_g = 0, total_b = 0;
    int max_r = 0, max_g = 0, max_b = 0;
    int min_r = 255, min_g = 255, min_b = 255;
    int total_pixels = 0;
    for (uint32_t i = 0; i < 16; i++) {
        const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
        if ((r | g | b) < 4) continue;

        max_r = std::max(max_r, r);
        max_g = std::max(max_g, g);
        max_b = std::max(max_b, b);
        min_r = std::min(min_r, r);
        min_g = std::min(min_g, g);
        min_b = std::min(min_b, b);
        total_r += r;
        total_g += g;
        total_b += b;

        total_pixels++;
    }

    if (!total_pixels) return false;

    int half_total_pixels = total_pixels >> 1;
    int avg_r = (total_r + half_total_pixels) / total_pixels;
    int avg_g = (total_g + half_total_pixels) / total_pixels;
    int avg_b = (total_b + half_total_pixels) / total_pixels;

    uint32_t low_c = 0, high_c = 0;

    int icov[6] = {0, 0, 0, 0, 0, 0};
    for (uint32_t i = 0; i < 16; i++) {
        int r = (int)pSrc_pixels[i].r;
        int g = (int)pSrc_pixels[i].g;
        int b = (int)pSrc_pixels[i].b;

        if ((r | g | b) < 4) continue;

        r -= avg_r;
        g -= avg_g;
        b -= avg_b;

        icov[0] += r * r;
        icov[1] += r * g;
        icov[2] += r * b;
        icov[3] += g * g;
        icov[4] += g * b;
        icov[5] += b * b;
    }

    float cov[6];
    for (uint32_t i = 0; i < 6; i++) cov[i] = (float)(icov[i]) * (1.0f / 255.0f);

    float xr = (float)(max_r - min_r);
    float xg = (float)(max_g - min_g);
    float xb = (float)(max_b - min_b);

    if (icov[2] < 0) xr = -xr;

    if (icov[4] < 0) xg = -xg;

    for (uint32_t power_iter = 0; power_iter < 4; power_iter++) {
        float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
        float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
        float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
        xr = r;
        xg = g;
        xb = b;
    }

    float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb));
    int saxis_r = 306, saxis_g = 601, saxis_b = 117;
    if (k >= 2) {
        float m = 1024.0f / k;
        saxis_r = (int)(xr * m);
        saxis_g = (int)(xg * m);
        saxis_b = (int)(xb * m);
    }

    int low_dot = INT_MAX, high_dot = INT_MIN;
    for (uint32_t i = 0; i < 16; i++) {
        int r = (int)pSrc_pixels[i].r, g = (int)pSrc_pixels[i].g, b = (int)pSrc_pixels[i].b;

        if ((r | g | b) < 4) continue;

        int dot = r * saxis_r + g * saxis_g + b * saxis_b;
        if (dot < low_dot) {
            low_dot = dot;
            low_c = i;
        }
        if (dot > high_dot) {
            high_dot = dot;
            high_c = i;
        }
    }

    int lr = scale8To5(pSrc_pixels[low_c].r);
    int lg = scale8To6(pSrc_pixels[low_c].g);
    int lb = scale8To5(pSrc_pixels[low_c].b);

    int hr = scale8To5(pSrc_pixels[high_c].r);
    int hg = scale8To6(pSrc_pixels[high_c].g);
    int hb = scale8To5(pSrc_pixels[high_c].b);

    uint8_t trial_sels[16];
    uint32_t trial_err = bc1_find_sels3_fullerr(true, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX);

    if (trial_err) {
        const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1;
        for (uint32_t trials = 0; trials < total_ls_passes; trials++) {
            vec3F xl, xh;
            int lr2, lg2, lb2, hr2, hg2, hb2;
            if (!compute_least_squares_endpoints3_rgb(true, pSrc_pixels, trial_sels, &xl, &xh)) {
                lr2 = g_bc1_match5_half[avg_r].m_hi;
                lg2 = g_bc1_match6_half[avg_g].m_hi;
                lb2 = g_bc1_match5_half[avg_b].m_hi;

                hr2 = g_bc1_match5_half[avg_r].m_lo;
                hg2 = g_bc1_match6_half[avg_g].m_lo;
                hb2 = g_bc1_match5_half[avg_b].m_lo;
            } else {
                precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2);
            }

            if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) break;

            uint8_t trial_sels2[16];
            uint32_t trial_err2 = bc1_find_sels3_fullerr(true, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err);

            if (trial_err2 < trial_err) {
                trial_err = trial_err2;
                lr = lr2;
                lg = lg2;
                lb = lb2;
                hr = hr2;
                hg = hg2;
                hb = hb2;
                memcpy(trial_sels, trial_sels2, sizeof(trial_sels));
            } else
                break;
        }
    }

    if (trial_err < cur_err) {
        results.m_3color = true;
        results.lr = lr;
        results.lg = lg;
        results.lb = lb;
        results.hr = hr;
        results.hg = hg;
        results.hb = hb;
        memcpy(results.sels, trial_sels, 16);

        cur_err = trial_err;

        return true;
    }

    return false;
}

static bool try_3color_block(const Color *pSrc_pixels, uint32_t flags, uint32_t &cur_err, int avg_r, int avg_g, int avg_b, int lr, int lg, int lb, int hr,
                             int hg, int hb, int total_r, int total_g, int total_b, uint32_t total_orderings_to_try, bc1_encode_results &results) {
    uint8_t trial_sels[16];
    uint32_t trial_err = bc1_find_sels3_fullerr(false, pSrc_pixels, lr, lg, lb, hr, hg, hb, trial_sels, UINT32_MAX);

    if (trial_err) {
        const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1;
        for (uint32_t trials = 0; trials < total_ls_passes; trials++) {
            vec3F xl, xh;
            int lr2, lg2, lb2, hr2, hg2, hb2;
            if (!compute_least_squares_endpoints3_rgb(false, pSrc_pixels, trial_sels, &xl, &xh)) {
                lr2 = g_bc1_match5_half[avg_r].m_hi;
                lg2 = g_bc1_match6_half[avg_g].m_hi;
                lb2 = g_bc1_match5_half[avg_b].m_hi;

                hr2 = g_bc1_match5_half[avg_r].m_lo;
                hg2 = g_bc1_match6_half[avg_g].m_lo;
                hb2 = g_bc1_match5_half[avg_b].m_lo;
            } else {
                precise_round_565(xl, xh, hr2, hg2, hb2, lr2, lg2, lb2);
            }

            if ((lr == lr2) && (lg == lg2) && (lb == lb2) && (hr == hr2) && (hg == hg2) && (hb == hb2)) break;

            uint8_t trial_sels2[16];
            uint32_t trial_err2 = bc1_find_sels3_fullerr(false, pSrc_pixels, lr2, lg2, lb2, hr2, hg2, hb2, trial_sels2, trial_err);

            if (trial_err2 < trial_err) {
                trial_err = trial_err2;
                lr = lr2;
                lg = lg2;
                lb = lb2;
                hr = hr2;
                hg = hg2;
                hb = hb2;
                memcpy(trial_sels, trial_sels2, sizeof(trial_sels));
            } else
                break;
        }
    }

    if ((trial_err) && (flags & cEncodeBC1UseLikelyTotalOrderings) && (total_orderings_to_try)) {
        hist3 h;
        for (uint32_t i = 0; i < 16; i++) {
            assert(trial_sels[i] < 3);
            h.m_hist[trial_sels[i]]++;
        }

        const uint32_t orig_total_order_index = h.lookup_total_ordering_index();

        int r0, g0, b0, r3, g3, b3;
        r0 = (lr << 3) | (lr >> 2);
        g0 = (lg << 2) | (lg >> 4);
        b0 = (lb << 3) | (lb >> 2);
        r3 = (hr << 3) | (hr >> 2);
        g3 = (hg << 2) | (hg >> 4);
        b3 = (hb << 3) | (hb >> 2);

        int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0;

        int dots[16];
        for (uint32_t i = 0; i < 16; i++) {
            int r = pSrc_pixels[i].r;
            int g = pSrc_pixels[i].g;
            int b = pSrc_pixels[i].b;
            int d = 0x1000000 + (r * ar + g * ag + b * ab);
            assert(d >= 0);
            dots[i] = (d << 4) + i;
        }

        std::sort(dots, dots + 16);

        uint32_t r_sum[17], g_sum[17], b_sum[17];
        uint32_t r = 0, g = 0, b = 0;
        for (uint32_t i = 0; i < 16; i++) {
            const uint32_t p = dots[i] & 15;

            r_sum[i] = r;
            g_sum[i] = g;
            b_sum[i] = b;

            r += pSrc_pixels[p].r;
            g += pSrc_pixels[p].g;
            b += pSrc_pixels[p].b;
        }

        r_sum[16] = total_r;
        g_sum[16] = total_g;
        b_sum[16] = total_b;

        const uint32_t q_total = (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS3 : std::min(total_orderings_to_try, MAX_TOTAL_ORDERINGS3);
        for (uint32_t q = 0; q < q_total; q++) {
            const uint32_t s = (flags & cEncodeBC1Exhaustive) ? q : g_best_total_orderings3[orig_total_order_index][q];

            int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;

            vec3F xl, xh;

            if ((s == TOTAL_ORDER_3_0_16) || (s == TOTAL_ORDER_3_1_16) || (s == TOTAL_ORDER_3_2_16)) {
                trial_lr = g_bc1_match5_half[avg_r].m_hi;
                trial_lg = g_bc1_match6_half[avg_g].m_hi;
                trial_lb = g_bc1_match5_half[avg_b].m_hi;

                trial_hr = g_bc1_match5_half[avg_r].m_lo;
                trial_hg = g_bc1_match6_half[avg_g].m_lo;
                trial_hb = g_bc1_match5_half[avg_b].m_lo;
            } else {
                compute_least_squares_endpoints3_rgb(&xl, &xh, total_r, total_g, total_b, g_selector_factors3[s][0], g_selector_factors3[s][1],
                                                     g_selector_factors3[s][2], s, r_sum, g_sum, b_sum);

                precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);
            }

            uint8_t trial_sels2[16];
            uint32_t trial_err2 =
                bc1_find_sels3_fullerr(false, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels2, UINT32_MAX);

            if (trial_err2 < trial_err) {
                trial_err = trial_err2;

                lr = trial_lr;
                lg = trial_lg;
                lb = trial_lb;

                hr = trial_hr;
                hg = trial_hg;
                hb = trial_hb;

                memcpy(trial_sels, trial_sels2, sizeof(trial_sels));
            }

        }  // s
    }

    if (trial_err < cur_err) {
        results.m_3color = true;
        results.lr = lr;
        results.lg = lg;
        results.lb = lb;
        results.hr = hr;
        results.hg = hg;
        results.hb = hb;
        memcpy(results.sels, trial_sels, 16);

        cur_err = trial_err;

        return true;
    }

    return false;
}

void encode_bc1(uint32_t level, void *pDst, const uint8_t *pPixels, bool allow_3color, bool allow_transparent_texels_for_black) {
    uint32_t flags = 0, total_orderings4 = 1, total_orderings3 = 1;

    static_assert(MAX_TOTAL_ORDERINGS3 >= 32, "MAX_TOTAL_ORDERINGS3 >= 32");
    static_assert(MAX_TOTAL_ORDERINGS4 >= 32, "MAX_TOTAL_ORDERINGS4 >= 32");

    switch (level) {
        case 0:
            // Faster/higher quality than stb_dxt default.
            flags = cEncodeBC1BoundingBoxInt;
            break;
        case 1:
            // Faster/higher quality than stb_dxt default. a bit higher average quality vs. mode 0.
            flags = cEncodeBC1Use2DLS;
            break;
        case 2:
            // On average mode 2 is a little weaker than modes 0/1, but it's stronger on outliers (very tough textures).
            // Slightly stronger than stb_dxt.
            flags = 0;
            break;
        case 3:
            // Slightly stronger than stb_dxt HIGHQUAL.
            flags = cEncodeBC1TwoLeastSquaresPasses;
            break;
        case 4:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1Use6PowerIters;
            break;
        default:
        case 5:
            // stb_dxt HIGHQUAL + permit 3 color (if it's enabled).
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            break;
        case 6:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            break;
        case 7:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 4;
            break;
        case 8:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFasterMSEEval | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 8;
            break;
        case 9:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 11;
            total_orderings3 = 3;
            break;
        case 10:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 20;
            total_orderings3 = 8;
            break;
        case 11:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 28;
            total_orderings3 = 16;
            break;
        case 12:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseLikelyTotalOrderings;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 32;
            total_orderings3 = 32;
            break;
        case 13:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    (20 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 32;
            total_orderings3 = 32;
            break;
        case 14:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 32;
            total_orderings3 = 32;
            break;
        case 15:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    (32 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = ((((32 + MAX_TOTAL_ORDERINGS4) / 2) + 32) / 2);
            total_orderings3 = 32;
            break;
        case 16:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = (32 + MAX_TOTAL_ORDERINGS4) / 2;
            total_orderings3 = 32;
            break;
        case 17:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = MAX_TOTAL_ORDERINGS4;
            total_orderings3 = 32;
            break;
        case 18:
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = MAX_TOTAL_ORDERINGS4;
            total_orderings3 = 32;
            break;
        case 19:
            // This hidden mode is *extremely* slow and abuses the encoder. It's just for testing/training.
            flags = cEncodeBC1TwoLeastSquaresPasses | cEncodeBC1UseFullMSEEval | cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use6PowerIters |
                    cEncodeBC1Exhaustive | cEncodeBC1Iterative | (256 << cEncodeBC1EndpointSearchRoundsShift) | cEncodeBC1TryAllInitialEndponts;
            flags |= (allow_3color ? cEncodeBC1Use3ColorBlocks : 0) | (allow_transparent_texels_for_black ? cEncodeBC1Use3ColorBlocksForBlackPixels : 0);
            total_orderings4 = 32;
            total_orderings3 = 32;
            break;
    }

    encode_bc1(pDst, pPixels, flags, total_orderings4, total_orderings3);
}

// Finds low and high colors to begin with
static inline void encode_bc1_pick_initial(const Color *pSrc_pixels, uint32_t flags, bool grayscale_flag, int min_r, int min_g, int min_b, int max_r,
                                           int max_g, int max_b, int avg_r, int avg_g, int avg_b, int total_r, int total_g, int total_b, int &lr, int &lg,
                                           int &lb, int &hr, int &hg, int &hb) {
    if (grayscale_flag) {
        const int fr = pSrc_pixels[0].r;

        // Grayscale blocks are a common enough case to specialize.
        if ((max_r - min_r) < 2) {
            lr = lb = hr = hb = scale8To5(fr);
            lg = hg = scale8To6(fr);
        } else {
            lr = lb = scale8To5(min_r);
            lg = scale8To6(min_r);

            hr = hb = scale8To5(max_r);
            hg = scale8To6(max_r);
        }
    } else if (flags & cEncodeBC1Use2DLS) {
        //  2D Least Squares approach from Humus's example, with added inset and optimal rounding.
        int big_chan = 0, min_chan_val = min_r, max_chan_val = max_r;
        if ((max_g - min_g) > (max_chan_val - min_chan_val)) big_chan = 1, min_chan_val = min_g, max_chan_val = max_g;

        if ((max_b - min_b) > (max_chan_val - min_chan_val)) big_chan = 2, min_chan_val = min_b, max_chan_val = max_b;

        int sum_xy_r = 0, sum_xy_g = 0, sum_xy_b = 0;
        vec3F l, h;
        if (big_chan == 0) {
            for (uint32_t i = 0; i < 16; i++) {
                const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
                sum_xy_r += r * r, sum_xy_g += r * g, sum_xy_b += r * b;
            }

            int sum_x = total_r;
            int sum_x2 = sum_xy_r;

            float div = (float)(16 * sum_x2 - sum_x * sum_x);
            float b_y = 0.0f, b_z = 0.0f;
            if (fabs(div) > 1e-8f) {
                div = 1.0f / div;
                b_y = (16 * sum_xy_g - sum_x * total_g) * div;
                b_z = (16 * sum_xy_b - sum_x * total_b) * div;
            }

            float a_y = (total_g - b_y * sum_x) / 16.0f;
            float a_z = (total_b - b_z * sum_x) / 16.0f;

            l.c[1] = a_y + b_y * min_chan_val;
            l.c[2] = a_z + b_z * min_chan_val;

            h.c[1] = a_y + b_y * max_chan_val;
            h.c[2] = a_z + b_z * max_chan_val;

            float dg = (h.c[1] - l.c[1]);
            float db = (h.c[2] - l.c[2]);

            h.c[1] = l.c[1] + dg * (15.0f / 16.0f);
            h.c[2] = l.c[2] + db * (15.0f / 16.0f);

            l.c[1] = l.c[1] + dg * (1.0f / 16.0f);
            l.c[2] = l.c[2] + db * (1.0f / 16.0f);

            float d = (float)(max_chan_val - min_chan_val);
            float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f);
            float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f);

            l.c[0] = fmin_chan_val;
            h.c[0] = fmax_chan_val;
        } else if (big_chan == 1) {
            for (uint32_t i = 0; i < 16; i++) {
                const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
                sum_xy_r += g * r, sum_xy_g += g * g, sum_xy_b += g * b;
            }

            int sum_x = total_g;
            int sum_x2 = sum_xy_g;

            float div = (float)(16 * sum_x2 - sum_x * sum_x);
            float b_x = 0.0f, b_z = 0.0f;
            if (fabs(div) > 1e-8f) {
                div = 1.0f / div;
                b_x = (16 * sum_xy_r - sum_x * total_r) * div;
                b_z = (16 * sum_xy_b - sum_x * total_b) * div;
            }

            float a_x = (total_r - b_x * sum_x) / 16.0f;
            float a_z = (total_b - b_z * sum_x) / 16.0f;

            l.c[0] = a_x + b_x * min_chan_val;
            l.c[2] = a_z + b_z * min_chan_val;

            h.c[0] = a_x + b_x * max_chan_val;
            h.c[2] = a_z + b_z * max_chan_val;

            float dr = (h.c[0] - l.c[0]);
            float db = (h.c[2] - l.c[2]);

            h.c[0] = l.c[0] + dr * (15.0f / 16.0f);
            h.c[2] = l.c[2] + db * (15.0f / 16.0f);

            l.c[0] = l.c[0] + dr * (1.0f / 16.0f);
            l.c[2] = l.c[2] + db * (1.0f / 16.0f);

            float d = (float)(max_chan_val - min_chan_val);
            float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f);
            float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f);

            l.c[1] = fmin_chan_val;
            h.c[1] = fmax_chan_val;
        } else {
            for (uint32_t i = 0; i < 16; i++) {
                const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
                sum_xy_r += b * r, sum_xy_g += b * g, sum_xy_b += b * b;
            }

            int sum_x = total_b;
            int sum_x2 = sum_xy_b;

            float div = (float)(16 * sum_x2 - sum_x * sum_x);
            float b_x = 0.0f, b_y = 0.0f;
            if (fabs(div) > 1e-8f) {
                div = 1.0f / div;
                b_x = (16 * sum_xy_r - sum_x * total_r) * div;
                b_y = (16 * sum_xy_g - sum_x * total_g) * div;
            }

            float a_x = (total_r - b_x * sum_x) / 16.0f;
            float a_y = (total_g - b_y * sum_x) / 16.0f;

            l.c[0] = a_x + b_x * min_chan_val;
            l.c[1] = a_y + b_y * min_chan_val;

            h.c[0] = a_x + b_x * max_chan_val;
            h.c[1] = a_y + b_y * max_chan_val;

            float dr = (h.c[0] - l.c[0]);
            float dg = (h.c[1] - l.c[1]);

            h.c[0] = l.c[0] + dr * (15.0f / 16.0f);
            h.c[1] = l.c[1] + dg * (15.0f / 16.0f);

            l.c[0] = l.c[0] + dr * (1.0f / 16.0f);
            l.c[1] = l.c[1] + dg * (1.0f / 16.0f);

            float d = (float)(max_chan_val - min_chan_val);
            float fmin_chan_val = min_chan_val + d * (1.0f / 16.0f);
            float fmax_chan_val = min_chan_val + d * (15.0f / 16.0f);

            l.c[2] = fmin_chan_val;
            h.c[2] = fmax_chan_val;
        }

        precise_round_565_noscale(l, h, lr, lg, lb, hr, hg, hb);
    } else if (flags & cEncodeBC1BoundingBox) {
        // Algorithm from icbc.h compress_dxt1_fast()
        vec3F l, h;
        l.c[0] = min_r * (1.0f / 255.0f);
        l.c[1] = min_g * (1.0f / 255.0f);
        l.c[2] = min_b * (1.0f / 255.0f);

        h.c[0] = max_r * (1.0f / 255.0f);
        h.c[1] = max_g * (1.0f / 255.0f);
        h.c[2] = max_b * (1.0f / 255.0f);

        const float bias = 8.0f / 255.0f;
        float inset_r = (h.c[0] - l.c[0] - bias) * (1.0f / 16.0f);
        float inset_g = (h.c[1] - l.c[1] - bias) * (1.0f / 16.0f);
        float inset_b = (h.c[2] - l.c[2] - bias) * (1.0f / 16.0f);

        l.c[0] = clampf(l.c[0] + inset_r, 0.0f, 1.0f);
        l.c[1] = clampf(l.c[1] + inset_g, 0.0f, 1.0f);
        l.c[2] = clampf(l.c[2] + inset_b, 0.0f, 1.0f);

        h.c[0] = clampf(h.c[0] - inset_r, 0.0f, 1.0f);
        h.c[1] = clampf(h.c[1] - inset_g, 0.0f, 1.0f);
        h.c[2] = clampf(h.c[2] - inset_b, 0.0f, 1.0f);

        int icov_xz = 0, icov_yz = 0;
        for (uint32_t i = 0; i < 16; i++) {
            int r = (int)pSrc_pixels[i].r - avg_r;
            int g = (int)pSrc_pixels[i].g - avg_g;
            int b = (int)pSrc_pixels[i].b - avg_b;
            icov_xz += r * b;
            icov_yz += g * b;
        }

        if (icov_xz < 0) std::swap(l.c[0], h.c[0]);

        if (icov_yz < 0) std::swap(l.c[1], h.c[1]);

        precise_round_565(l, h, lr, lg, lb, hr, hg, hb);
    } else if (flags & cEncodeBC1BoundingBoxInt) {
        // Algorithm from icbc.h compress_dxt1_fast(), but converted to integer.
        int inset_r = (max_r - min_r - 8) >> 4;
        int inset_g = (max_g - min_g - 8) >> 4;
        int inset_b = (max_b - min_b - 8) >> 4;

        min_r += inset_r;
        min_g += inset_g;
        min_b += inset_b;
        if ((uint32_t)(min_r | min_g | min_b) > 255U) {
            min_r = clampi(min_r, 0, 255);
            min_g = clampi(min_g, 0, 255);
            min_b = clampi(min_b, 0, 255);
        }

        max_r -= inset_r;
        max_g -= inset_g;
        max_b -= inset_b;
        if ((uint32_t)(max_r | max_g | max_b) > 255U) {
            max_r = clampi(max_r, 0, 255);
            max_g = clampi(max_g, 0, 255);
            max_b = clampi(max_b, 0, 255);
        }

        int icov_xz = 0, icov_yz = 0;
        for (uint32_t i = 0; i < 16; i++) {
            int r = (int)pSrc_pixels[i].r - avg_r;
            int g = (int)pSrc_pixels[i].g - avg_g;
            int b = (int)pSrc_pixels[i].b - avg_b;
            icov_xz += r * b;
            icov_yz += g * b;
        }

        int x0 = min_r;
        int y0 = min_g;
        int x1 = max_r;
        int y1 = max_g;

        // swap r and g min and max to align principal axis
        if (icov_xz < 0) std::swap(x0, x1);

        if (icov_yz < 0) std::swap(y0, y1);

        lr = scale8To5(x0);
        lg = scale8To6(y0);
        lb = scale8To5(min_b);

        hr = scale8To5(x1);
        hg = scale8To6(y1);
        hb = scale8To5(max_b);
    } else {
        // Select 2 colors along the principle axis. (There must be a faster/simpler way.)
        uint32_t low_c = 0, high_c = 0;

        int icov[6] = {0, 0, 0, 0, 0, 0};
        for (uint32_t i = 0; i < 16; i++) {
            int r = (int)pSrc_pixels[i].r - avg_r;
            int g = (int)pSrc_pixels[i].g - avg_g;
            int b = (int)pSrc_pixels[i].b - avg_b;
            icov[0] += r * r;
            icov[1] += r * g;
            icov[2] += r * b;
            icov[3] += g * g;
            icov[4] += g * b;
            icov[5] += b * b;
        }

        int saxis_r = 306, saxis_g = 601, saxis_b = 117;

        float xr = (float)(max_r - min_r);
        float xg = (float)(max_g - min_g);
        float xb = (float)(max_b - min_b);

        if (icov[2] < 0) xr = -xr;

        if (icov[4] < 0) xg = -xg;

        float cov[6];
        for (uint32_t i = 0; i < 6; i++) cov[i] = (float)(icov[i]) * (1.0f / 255.0f);

        const uint32_t total_power_iters = (flags & cEncodeBC1Use6PowerIters) ? 6 : 4;
        for (uint32_t power_iter = 0; power_iter < total_power_iters; power_iter++) {
            float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
            float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
            float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
            xr = r;
            xg = g;
            xb = b;
        }

        float k = maximum(fabsf(xr), fabsf(xg), fabsf(xb));
        if (k >= 2) {
            float m = 2048.0f / k;
            saxis_r = (int)(xr * m);
            saxis_g = (int)(xg * m);
            saxis_b = (int)(xb * m);
        }

        int low_dot = INT_MAX, high_dot = INT_MIN;

        saxis_r = (int)((uint32_t)saxis_r << 4U);
        saxis_g = (int)((uint32_t)saxis_g << 4U);
        saxis_b = (int)((uint32_t)saxis_b << 4U);

        for (uint32_t i = 0; i < 16; i += 4) {
            int dot0 = ((pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b) & ~0xF) + i;
            int dot1 = ((pSrc_pixels[i + 1].r * saxis_r + pSrc_pixels[i + 1].g * saxis_g + pSrc_pixels[i + 1].b * saxis_b) & ~0xF) + i + 1;
            int dot2 = ((pSrc_pixels[i + 2].r * saxis_r + pSrc_pixels[i + 2].g * saxis_g + pSrc_pixels[i + 2].b * saxis_b) & ~0xF) + i + 2;
            int dot3 = ((pSrc_pixels[i + 3].r * saxis_r + pSrc_pixels[i + 3].g * saxis_g + pSrc_pixels[i + 3].b * saxis_b) & ~0xF) + i + 3;

            int min_d01 = std::min(dot0, dot1);
            int max_d01 = std::max(dot0, dot1);

            int min_d23 = std::min(dot2, dot3);
            int max_d23 = std::max(dot2, dot3);

            int min_d = std::min(min_d01, min_d23);
            int max_d = std::max(max_d01, max_d23);

            low_dot = std::min(low_dot, min_d);
            high_dot = std::max(high_dot, max_d);
        }
        low_c = low_dot & 15;
        high_c = high_dot & 15;

        lr = scale8To5(pSrc_pixels[low_c].r);
        lg = scale8To6(pSrc_pixels[low_c].g);
        lb = scale8To5(pSrc_pixels[low_c].b);

        hr = scale8To5(pSrc_pixels[high_c].r);
        hg = scale8To6(pSrc_pixels[high_c].g);
        hb = scale8To5(pSrc_pixels[high_c].b);
    }
}

static const int8_t s_adjacent_voxels[16][4] = {
    {1, 0, 0, 3},    // 0
    {0, 1, 0, 4},    // 1
    {0, 0, 1, 5},    // 2
    {-1, 0, 0, 0},   // 3
    {0, -1, 0, 1},   // 4
    {0, 0, -1, 2},   // 5
    {1, 1, 0, 9},    // 6
    {1, 0, 1, 10},   // 7
    {0, 1, 1, 11},   // 8
    {-1, -1, 0, 6},  // 9
    {-1, 0, -1, 7},  // 10
    {0, -1, -1, 8},  // 11
    {-1, 1, 0, 13},  // 12
    {1, -1, 0, 12},  // 13
    {0, -1, 1, 15},  // 14
    {0, 1, -1, 14},  // 15
};

// From icbc's high quality mode.
static inline void encode_bc1_endpoint_search(const Color *pSrc_pixels, bool any_black_pixels, uint32_t flags, bc1_encode_results &results,
                                              uint32_t cur_err) {
    int &lr = results.lr, &lg = results.lg, &lb = results.lb, &hr = results.hr, &hg = results.hg, &hb = results.hb;
    uint8_t *sels = results.sels;

    int prev_improvement_index = 0, forbidden_direction = -1;

    const int endpoint_search_rounds = (flags & cEncodeBC1EndpointSearchRoundsMask) >> cEncodeBC1EndpointSearchRoundsShift;
    for (int i = 0; i < endpoint_search_rounds; i++) {
        assert(s_adjacent_voxels[s_adjacent_voxels[i & 15][3]][3] == (i & 15));

        if (forbidden_direction == (i & 31)) continue;

        const int8_t delta[3] = {s_adjacent_voxels[i & 15][0], s_adjacent_voxels[i & 15][1], s_adjacent_voxels[i & 15][2]};

        int trial_lr = lr, trial_lg = lg, trial_lb = lb, trial_hr = hr, trial_hg = hg, trial_hb = hb;

        if ((i >> 4) & 1) {
            trial_lr = clampi(trial_lr + delta[0], 0, 31);
            trial_lg = clampi(trial_lg + delta[1], 0, 63);
            trial_lb = clampi(trial_lb + delta[2], 0, 31);
        } else {
            trial_hr = clampi(trial_hr + delta[0], 0, 31);
            trial_hg = clampi(trial_hg + delta[1], 0, 63);
            trial_hb = clampi(trial_hb + delta[2], 0, 31);
        }

        uint8_t trial_sels[16];

        uint32_t trial_err;
        if (results.m_3color) {
            trial_err = bc1_find_sels3_fullerr(((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)), pSrc_pixels, trial_lr,
                                               trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err);
        } else {
            trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err);
        }

        if (trial_err < cur_err) {
            cur_err = trial_err;

            forbidden_direction = s_adjacent_voxels[i & 15][3] | (i & 16);

            lr = trial_lr, lg = trial_lg, lb = trial_lb, hr = trial_hr, hg = trial_hg, hb = trial_hb;

            memcpy(sels, trial_sels, 16);

            prev_improvement_index = i;
        }

        if (i - prev_improvement_index > 32) break;
    }
}

void encode_bc1(void *pDst, const uint8_t *pPixels, uint32_t flags, uint32_t total_orderings_to_try, uint32_t total_orderings_to_try3) {
    assert(g_initialized);

    const Color *pSrc_pixels = (const Color *)pPixels;
    BC1Block *pDst_block = static_cast<BC1Block *>(pDst);

    int avg_r, avg_g, avg_b, min_r, min_g, min_b, max_r, max_g, max_b;

    const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b;

    uint32_t j;
    for (j = 15; j >= 1; --j)
        if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) break;

    if (j == 0) {
        encode_bc1_solid_block(pDst, fr, fg, fb, (flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0);
        return;
    }

    int total_r = fr, total_g = fg, total_b = fb;

    max_r = fr, max_g = fg, max_b = fb;
    min_r = fr, min_g = fg, min_b = fb;

    uint32_t grayscale_flag = (fr == fg) && (fr == fb);
    uint32_t any_black_pixels = (fr | fg | fb) < 4;

    for (uint32_t i = 1; i < 16; i++) {
        const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;

        grayscale_flag &= ((r == g) && (r == b));
        any_black_pixels |= ((r | g | b) < 4);

        max_r = std::max(max_r, r);
        max_g = std::max(max_g, g);
        max_b = std::max(max_b, b);
        min_r = std::min(min_r, r);
        min_g = std::min(min_g, g);
        min_b = std::min(min_b, b);
        total_r += r;
        total_g += g;
        total_b += b;
    }

    avg_r = (total_r + 8) >> 4, avg_g = (total_g + 8) >> 4, avg_b = (total_b + 8) >> 4;

    bc1_encode_results results;
    results.m_3color = false;

    uint8_t *sels = results.sels;
    int &lr = results.lr, &lg = results.lg, &lb = results.lb, &hr = results.hr, &hg = results.hg, &hb = results.hb;
    int orig_lr = 0, orig_lg = 0, orig_lb = 0, orig_hr = 0, orig_hg = 0, orig_hb = 0;

    lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0;

    const bool needs_block_error =
        ((flags & (cEncodeBC1UseLikelyTotalOrderings | cEncodeBC1Use3ColorBlocks | cEncodeBC1UseFullMSEEval | cEncodeBC1EndpointSearchRoundsMask)) != 0) ||
        (any_black_pixels && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0));

    uint32_t cur_err = UINT32_MAX;

    if (!needs_block_error) {
        assert((flags & cEncodeBC1TryAllInitialEndponts) == 0);

        encode_bc1_pick_initial(pSrc_pixels, flags, grayscale_flag != 0, min_r, min_g, min_b, max_r, max_g, max_b, avg_r, avg_g, avg_b, total_r, total_g,
                                total_b, lr, lg, lb, hr, hg, hb);

        orig_lr = lr, orig_lg = lg, orig_lb = lb, orig_hr = hr, orig_hg = hg, orig_hb = hb;

        bc1_find_sels4_noerr(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels);

        const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1;
        for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) {
            int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;

            vec3F xl, xh;
            if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, sels, &xl, &xh, total_r, total_g, total_b)) {
                // All selectors equal - treat it as a solid block which should always be equal or better.
                trial_lr = g_bc1_match5_equals_1[avg_r].m_hi;
                trial_lg = g_bc1_match6_equals_1[avg_g].m_hi;
                trial_lb = g_bc1_match5_equals_1[avg_b].m_hi;

                trial_hr = g_bc1_match5_equals_1[avg_r].m_lo;
                trial_hg = g_bc1_match6_equals_1[avg_g].m_lo;
                trial_hb = g_bc1_match5_equals_1[avg_b].m_lo;

                // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge.
            } else {
                precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);
            }

            if ((lr == trial_lr) && (lg == trial_lg) && (lb == trial_lb) && (hr == trial_hr) && (hg == trial_hg) && (hb == trial_hb)) break;

            bc1_find_sels4_noerr(pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, sels);

            lr = trial_lr;
            lg = trial_lg;
            lb = trial_lb;
            hr = trial_hr;
            hg = trial_hg;
            hb = trial_hb;

        }  // ls_pass
    } else {
        // calculate block error from naïve approach
        const uint32_t total_rounds = (flags & cEncodeBC1TryAllInitialEndponts) ? 2 : 1;
        for (uint32_t round = 0; round < total_rounds; round++) {
            uint32_t modified_flags = flags;
            if (round == 1) {
                modified_flags &= ~(cEncodeBC1Use2DLS | cEncodeBC1BoundingBox);
                modified_flags |= cEncodeBC1BoundingBox;
            }

            int round_lr, round_lg, round_lb, round_hr, round_hg, round_hb;
            uint8_t round_sels[16];

            encode_bc1_pick_initial(pSrc_pixels, modified_flags, grayscale_flag != 0, min_r, min_g, min_b, max_r, max_g, max_b, avg_r, avg_g, avg_b, total_r,
                                    total_g, total_b, round_lr, round_lg, round_lb, round_hr, round_hg, round_hb);

            int orig_round_lr = round_lr, orig_round_lg = round_lg, orig_round_lb = round_lb, orig_round_hr = round_hr, orig_round_hg = round_hg,
                orig_round_hb = round_hb;

            uint32_t round_err = bc1_find_sels4(flags, pSrc_pixels, round_lr, round_lg, round_lb, round_hr, round_hg, round_hb, round_sels, UINT32_MAX);

            const uint32_t total_ls_passes = flags & cEncodeBC1TwoLeastSquaresPasses ? 2 : 1;
            for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) {
                int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;

                vec3F xl, xh;
                if (!compute_least_squares_endpoints4_rgb(pSrc_pixels, round_sels, &xl, &xh, total_r, total_g, total_b)) {
                    // All selectors equal - treat it as a solid block which should always be equal or better.
                    trial_lr = g_bc1_match5_equals_1[avg_r].m_hi;
                    trial_lg = g_bc1_match6_equals_1[avg_g].m_hi;
                    trial_lb = g_bc1_match5_equals_1[avg_b].m_hi;

                    trial_hr = g_bc1_match5_equals_1[avg_r].m_lo;
                    trial_hg = g_bc1_match6_equals_1[avg_g].m_lo;
                    trial_hb = g_bc1_match5_equals_1[avg_b].m_lo;

                    // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge.
                } else {
                    precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);
                }

                if ((round_lr == trial_lr) && (round_lg == trial_lg) && (round_lb == trial_lb) && (round_hr == trial_hr) && (round_hg == trial_hg) &&
                    (round_hb == trial_hb))
                    break;

                uint8_t trial_sels[16];
                uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, round_err);

                if (trial_err < round_err) {
                    round_lr = trial_lr;
                    round_lg = trial_lg;
                    round_lb = trial_lb;

                    round_hr = trial_hr;
                    round_hg = trial_hg;
                    round_hb = trial_hb;

                    round_err = trial_err;
                    memcpy(round_sels, trial_sels, 16);
                } else
                    break;

            }  // ls_pass

            if (round_err <= cur_err) {
                cur_err = round_err;

                lr = round_lr;
                lg = round_lg;
                lb = round_lb;
                hr = round_hr;
                hg = round_hg;
                hb = round_hb;

                orig_lr = orig_round_lr;
                orig_lg = orig_round_lg;
                orig_lb = orig_round_lb;
                orig_hr = orig_round_hr;
                orig_hg = orig_round_hg;
                orig_hb = orig_round_hb;

                memcpy(sels, round_sels, 16);
            }

        }  // round
    }

    if ((cur_err) && (flags & cEncodeBC1UseLikelyTotalOrderings)) {
        assert(needs_block_error);

        const uint32_t total_iters = (flags & cEncodeBC1Iterative) ? 2 : 1;
        for (uint32_t iter_index = 0; iter_index < total_iters; iter_index++) {
            const uint32_t orig_err = cur_err;

            hist4 h;
            for (uint32_t i = 0; i < 16; i++) {
                assert(sels[i] < 4);
                h.m_hist[sels[i]]++;
            }

            const uint32_t orig_total_order_index = h.lookup_total_ordering_index();

            int r0, g0, b0, r3, g3, b3;
            r0 = (lr << 3) | (lr >> 2);
            g0 = (lg << 2) | (lg >> 4);
            b0 = (lb << 3) | (lb >> 2);
            r3 = (hr << 3) | (hr >> 2);
            g3 = (hg << 2) | (hg >> 4);
            b3 = (hb << 3) | (hb >> 2);

            int ar = r3 - r0, ag = g3 - g0, ab = b3 - b0;

            int dots[16];
            for (uint32_t i = 0; i < 16; i++) {
                int r = pSrc_pixels[i].r;
                int g = pSrc_pixels[i].g;
                int b = pSrc_pixels[i].b;
                int d = 0x1000000 + (r * ar + g * ag + b * ab);
                assert(d >= 0);
                dots[i] = (d << 4) + i;
            }

            std::sort(dots, dots + 16);

            uint32_t r_sum[17], g_sum[17], b_sum[17];
            uint32_t r = 0, g = 0, b = 0;
            for (uint32_t i = 0; i < 16; i++) {
                const uint32_t p = dots[i] & 15;

                r_sum[i] = r;
                g_sum[i] = g;
                b_sum[i] = b;

                r += pSrc_pixels[p].r;
                g += pSrc_pixels[p].g;
                b += pSrc_pixels[p].b;
            }

            r_sum[16] = total_r;
            g_sum[16] = total_g;
            b_sum[16] = total_b;

            const uint32_t q_total =
                (flags & cEncodeBC1Exhaustive) ? NUM_UNIQUE_TOTAL_ORDERINGS4 : clampi(total_orderings_to_try, MIN_TOTAL_ORDERINGS, MAX_TOTAL_ORDERINGS4);
            for (uint32_t q = 0; q < q_total; q++) {
                const uint32_t s = (flags & cEncodeBC1Exhaustive) ? q : g_best_total_orderings4[orig_total_order_index][q];

                int trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb;

                vec3F xl, xh;

                if ((s == TOTAL_ORDER_4_0_16) || (s == TOTAL_ORDER_4_1_16) || (s == TOTAL_ORDER_4_2_16) || (s == TOTAL_ORDER_4_3_16)) {
                    trial_lr = g_bc1_match5_equals_1[avg_r].m_hi;
                    trial_lg = g_bc1_match6_equals_1[avg_g].m_hi;
                    trial_lb = g_bc1_match5_equals_1[avg_b].m_hi;

                    trial_hr = g_bc1_match5_equals_1[avg_r].m_lo;
                    trial_hg = g_bc1_match6_equals_1[avg_g].m_lo;
                    trial_hb = g_bc1_match5_equals_1[avg_b].m_lo;
                } else {
                    compute_least_squares_endpoints4_rgb(&xl, &xh, total_r, total_g, total_b, g_selector_factors4[s][0], g_selector_factors4[s][1],
                                                         g_selector_factors4[s][2], s, r_sum, g_sum, b_sum);

                    precise_round_565(xl, xh, trial_hr, trial_hg, trial_hb, trial_lr, trial_lg, trial_lb);
                }

                uint8_t trial_sels[16];

                uint32_t trial_err = bc1_find_sels4(flags, pSrc_pixels, trial_lr, trial_lg, trial_lb, trial_hr, trial_hg, trial_hb, trial_sels, cur_err);

                if (trial_err < cur_err) {
                    cur_err = trial_err;

                    lr = trial_lr;
                    lg = trial_lg;
                    lb = trial_lb;

                    hr = trial_hr;
                    hg = trial_hg;
                    hb = trial_hb;

                    memcpy(sels, trial_sels, 16);
                }

            }  // s

            if ((!cur_err) || (cur_err == orig_err)) break;

        }  // iter_index
    }

    if (((flags & (cEncodeBC1Use3ColorBlocks | cEncodeBC1Use3ColorBlocksForBlackPixels)) != 0) && (cur_err)) {
        if (flags & cEncodeBC1Use3ColorBlocks) {
            assert(needs_block_error);
            try_3color_block(pSrc_pixels, flags, cur_err, avg_r, avg_g, avg_b, orig_lr, orig_lg, orig_lb, orig_hr, orig_hg, orig_hb, total_r, total_g, total_b,
                             total_orderings_to_try3, results);
        }

        if ((any_black_pixels) && ((flags & cEncodeBC1Use3ColorBlocksForBlackPixels) != 0)) {
            assert(needs_block_error);
            try_3color_block_useblack(pSrc_pixels, flags, cur_err, results);
        }
    }

    if ((flags & cEncodeBC1EndpointSearchRoundsMask) && (cur_err)) {
        assert(needs_block_error);

        encode_bc1_endpoint_search(pSrc_pixels, any_black_pixels != 0, flags, results, cur_err);
    }

    if (results.m_3color)
        bc1_encode3(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels);
    else
        bc1_encode4(pDst_block, results.lr, results.lg, results.lb, results.hr, results.hg, results.hb, results.sels);
}

// BC3-5
void encode_bc4(void *pDst, const uint8_t *pPixels, uint32_t stride) {
    assert(g_initialized);

    uint32_t min0_v, max0_v, min1_v, max1_v, min2_v, max2_v, min3_v, max3_v;

    {
        min0_v = max0_v = pPixels[0 * stride];
        min1_v = max1_v = pPixels[1 * stride];
        min2_v = max2_v = pPixels[2 * stride];
        min3_v = max3_v = pPixels[3 * stride];
    }

    {
        uint32_t v0 = pPixels[4 * stride];
        min0_v = std::min(min0_v, v0);
        max0_v = std::max(max0_v, v0);
        uint32_t v1 = pPixels[5 * stride];
        min1_v = std::min(min1_v, v1);
        max1_v = std::max(max1_v, v1);
        uint32_t v2 = pPixels[6 * stride];
        min2_v = std::min(min2_v, v2);
        max2_v = std::max(max2_v, v2);
        uint32_t v3 = pPixels[7 * stride];
        min3_v = std::min(min3_v, v3);
        max3_v = std::max(max3_v, v3);
    }

    {
        uint32_t v0 = pPixels[8 * stride];
        min0_v = std::min(min0_v, v0);
        max0_v = std::max(max0_v, v0);
        uint32_t v1 = pPixels[9 * stride];
        min1_v = std::min(min1_v, v1);
        max1_v = std::max(max1_v, v1);
        uint32_t v2 = pPixels[10 * stride];
        min2_v = std::min(min2_v, v2);
        max2_v = std::max(max2_v, v2);
        uint32_t v3 = pPixels[11 * stride];
        min3_v = std::min(min3_v, v3);
        max3_v = std::max(max3_v, v3);
    }

    {
        uint32_t v0 = pPixels[12 * stride];
        min0_v = std::min(min0_v, v0);
        max0_v = std::max(max0_v, v0);
        uint32_t v1 = pPixels[13 * stride];
        min1_v = std::min(min1_v, v1);
        max1_v = std::max(max1_v, v1);
        uint32_t v2 = pPixels[14 * stride];
        min2_v = std::min(min2_v, v2);
        max2_v = std::max(max2_v, v2);
        uint32_t v3 = pPixels[15 * stride];
        min3_v = std::min(min3_v, v3);
        max3_v = std::max(max3_v, v3);
    }

    const uint32_t min_v = minimum(min0_v, min1_v, min2_v, min3_v);
    const uint32_t max_v = maximum(max0_v, max1_v, max2_v, max3_v);

    uint8_t *pDst_bytes = static_cast<uint8_t *>(pDst);
    pDst_bytes[0] = (uint8_t)max_v;
    pDst_bytes[1] = (uint8_t)min_v;

    if (max_v == min_v) {
        memset(pDst_bytes + 2, 0, 6);
        return;
    }

    const uint32_t delta = max_v - min_v;

    // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors.
    const int t0 = delta * 13;
    const int t1 = delta * 11;
    const int t2 = delta * 9;
    const int t3 = delta * 7;
    const int t4 = delta * 5;
    const int t5 = delta * 3;
    const int t6 = delta * 1;

    // BC4 floors in its divisions, which we compensate for with the 4 bias.
    // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one).
    const int bias = 4 - min_v * 14;

    static const uint32_t s_tran0[8] = {1U, 7U, 6U, 5U, 4U, 3U, 2U, 0U};
    static const uint32_t s_tran1[8] = {1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U};
    static const uint32_t s_tran2[8] = {1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U};
    static const uint32_t s_tran3[8] = {1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U};

    uint64_t a0, a1, a2, a3;
    {
        const int v0 = pPixels[0 * stride] * 14 + bias;
        const int v1 = pPixels[1 * stride] * 14 + bias;
        const int v2 = pPixels[2 * stride] * 14 + bias;
        const int v3 = pPixels[3 * stride] * 14 + bias;
        a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)];
        a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)];
        a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)];
        a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)];
    }

    {
        const int v0 = pPixels[4 * stride] * 14 + bias;
        const int v1 = pPixels[5 * stride] * 14 + bias;
        const int v2 = pPixels[6 * stride] * 14 + bias;
        const int v3 = pPixels[7 * stride] * 14 + bias;
        a0 |= (uint64_t)(s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U);
        a1 |= (uint64_t)(s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U);
        a2 |= (uint64_t)(s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U);
        a3 |= (uint64_t)(s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U);
    }

    {
        const int v0 = pPixels[8 * stride] * 14 + bias;
        const int v1 = pPixels[9 * stride] * 14 + bias;
        const int v2 = pPixels[10 * stride] * 14 + bias;
        const int v3 = pPixels[11 * stride] * 14 + bias;
        a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U);
        a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U);
        a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U);
        a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U);
    }

    {
        const int v0 = pPixels[12 * stride] * 14 + bias;
        const int v1 = pPixels[13 * stride] * 14 + bias;
        const int v2 = pPixels[14 * stride] * 14 + bias;
        const int v3 = pPixels[15 * stride] * 14 + bias;
        a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U);
        a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U);
        a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U);
        a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U);
    }

    const uint64_t f = a0 | a1 | a2 | a3;

    // TODO: make this less silly by using the BC4Block class
    pDst_bytes[2] = (uint8_t)f;
    pDst_bytes[3] = (uint8_t)(f >> 8U);
    pDst_bytes[4] = (uint8_t)(f >> 16U);
    pDst_bytes[5] = (uint8_t)(f >> 24U);
    pDst_bytes[6] = (uint8_t)(f >> 32U);
    pDst_bytes[7] = (uint8_t)(f >> 40U);
}

void encode_bc3(BC3Block *pDst, const uint8_t *pPixels, uint32_t flags, uint32_t total_orderings_to_try) {
    assert(g_initialized);

    // 3-color blocks are not allowed with BC3 (on most GPU's).
    flags &= ~(cEncodeBC1Use3ColorBlocksForBlackPixels | cEncodeBC1Use3ColorBlocks);

    encode_bc4(&pDst->alpha_block, pPixels + 3, 4);
    encode_bc1(&pDst->color_block, pPixels, flags, total_orderings_to_try);
}

void encode_bc3(uint32_t level, BC3Block *pDst, const uint8_t *pPixels) {
    assert(g_initialized);

    encode_bc4(&pDst->alpha_block, pPixels + 3, 4);
    encode_bc1(level, &pDst->color_block, pPixels, false, false);
}

void encode_bc5(BC5Block *pDst, const uint8_t *pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) {
    assert(g_initialized);

    encode_bc4(&pDst->chan0_block, pPixels + chan0, stride);
    encode_bc4(&pDst->chan1_block, pPixels + chan1, stride);
}

// Returns true if the block uses 3 color punchthrough alpha mode.
bool unpack_bc1(const void *pBlock_bits, void *pPixels, bool set_alpha, bc1_approx_mode mode) {
    Color *pDst_pixels = static_cast<Color *>(pPixels);

    static_assert(sizeof(BC1Block) == 8, "sizeof(BC1Block) == 8");
    static_assert(sizeof(BC4Block) == 8, "sizeof(BC4Block) == 8");

    const BC1Block *pBlock = static_cast<const BC1Block *>(pBlock_bits);

    const uint32_t l = pBlock->GetLowColor();
    const uint32_t h = pBlock->GetHighColor();

    Color c[4];

    const int cr0 = (l >> 11) & 31;
    const int cg0 = (l >> 5) & 63;
    const int cb0 = l & 31;
    const int r0 = (cr0 << 3) | (cr0 >> 2);
    const int g0 = (cg0 << 2) | (cg0 >> 4);
    const int b0 = (cb0 << 3) | (cb0 >> 2);

    const int cr1 = (h >> 11) & 31;
    const int cg1 = (h >> 5) & 63;
    const int cb1 = h & 31;
    const int r1 = (cr1 << 3) | (cr1 >> 2);
    const int g1 = (cg1 << 2) | (cg1 >> 4);
    const int b1 = (cb1 << 3) | (cb1 >> 2);

    bool used_punchthrough = false;

    if (l > h) {
        c[0].SetRGBA(r0, g0, b0, 255);
        c[1].SetRGBA(r1, g1, b1, 255);
        switch (mode) {
            case bc1_approx_mode::cBC1Ideal:
                c[2].SetRGBA((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255);
                c[3].SetRGBA((r1 * 2 + r0) / 3, (g1 * 2 + g0) / 3, (b1 * 2 + b0) / 3, 255);
                break;
            case bc1_approx_mode::cBC1IdealRound4:
                c[2].SetRGBA((r0 * 2 + r1 + 1) / 3, (g0 * 2 + g1 + 1) / 3, (b0 * 2 + b1 + 1) / 3, 255);
                c[3].SetRGBA((r1 * 2 + r0 + 1) / 3, (g1 * 2 + g0 + 1) / 3, (b1 * 2 + b0 + 1) / 3, 255);
                break;
            case bc1_approx_mode::cBC1NVidia:
                c[2].SetRGBA(interp_5_nv(cr0, cr1), interp_6_nv(g0, g1), interp_5_nv(cb0, cb1), 255);
                c[3].SetRGBA(interp_5_nv(cr1, cr0), interp_6_nv(g1, g0), interp_5_nv(cb1, cb0), 255);
                break;
            case bc1_approx_mode::cBC1AMD:
                c[2].SetRGBA(interp_5_6_amd(r0, r1), interp_5_6_amd(g0, g1), interp_5_6_amd(b0, b1), 255);
                c[3].SetRGBA(interp_5_6_amd(r1, r0), interp_5_6_amd(g1, g0), interp_5_6_amd(b1, b0), 255);
                break;
        }
    } else {
        c[0].SetRGBA(r0, g0, b0, 255);
        c[1].SetRGBA(r1, g1, b1, 255);
        switch (mode) {
            case bc1_approx_mode::cBC1Ideal:
            case bc1_approx_mode::cBC1IdealRound4:
                c[2].SetRGBA((r0 + r1) / 2, (g0 + g1) / 2, (b0 + b1) / 2, 255);
                break;
            case bc1_approx_mode::cBC1NVidia:
                c[2].SetRGBA(interp_half_5_nv(cr0, cr1), interp_half_6_nv(g0, g1), interp_half_5_nv(cb0, cb1), 255);
                break;
            case bc1_approx_mode::cBC1AMD:
                c[2].SetRGBA(interp_half_5_6_amd(r0, r1), interp_half_5_6_amd(g0, g1), interp_half_5_6_amd(b0, b1), 255);
                break;
        }

        c[3].SetRGBA(0, 0, 0, 0);
        used_punchthrough = true;
    }

    if (set_alpha) {
        for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) {
            pDst_pixels[0] = c[pBlock->GetSelector(0, y)];
            pDst_pixels[1] = c[pBlock->GetSelector(1, y)];
            pDst_pixels[2] = c[pBlock->GetSelector(2, y)];
            pDst_pixels[3] = c[pBlock->GetSelector(3, y)];
        }
    } else {
        for (uint32_t y = 0; y < 4; y++, pDst_pixels += 4) {
            pDst_pixels[0].SetRGBA(c[pBlock->GetSelector(0, y)]);
            pDst_pixels[1].SetRGBA(c[pBlock->GetSelector(1, y)]);
            pDst_pixels[2].SetRGBA(c[pBlock->GetSelector(2, y)]);
            pDst_pixels[3].SetRGBA(c[pBlock->GetSelector(3, y)]);
        }
    }

    return used_punchthrough;
}

void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride) {
    static_assert(sizeof(BC4Block) == 8, "sizeof(BC4Block) == 8");

    const BC4Block *pBlock = static_cast<const BC4Block *>(pBlock_bits);

    auto sel_values = BC4Block::GetValues(pBlock->GetLowAlpha(), pBlock->GetHighAlpha());

    const uint64_t selector_bits = pBlock->GetSelectorBits();

    for (uint32_t y = 0; y < 4; y++, pPixels += (stride * 4U)) {
        pPixels[0] = sel_values[pBlock->GetSelector(0, y, selector_bits)];
        pPixels[stride * 1] = sel_values[pBlock->GetSelector(1, y, selector_bits)];
        pPixels[stride * 2] = sel_values[pBlock->GetSelector(2, y, selector_bits)];
        pPixels[stride * 3] = sel_values[pBlock->GetSelector(3, y, selector_bits)];
    }
}

// Returns false if the block uses 3-color punchthrough alpha mode, which isn't supported on some GPU's for BC3.
bool unpack_bc3(const void *pBlock_bits, void *pPixels, bc1_approx_mode mode) {
    Color *pDst_pixels = static_cast<Color *>(pPixels);

    bool success = true;

    if (unpack_bc1((const uint8_t *)pBlock_bits + sizeof(BC4Block), pDst_pixels, true, mode)) success = false;

    unpack_bc4(pBlock_bits, &pDst_pixels[0].a, sizeof(Color));

    return success;
}

// writes RG
void unpack_bc5(const void *pBlock_bits, void *pPixels, uint32_t chan0, uint32_t chan1, uint32_t stride) {
    unpack_bc4(pBlock_bits, (uint8_t *)pPixels + chan0, stride);
    unpack_bc4((const uint8_t *)pBlock_bits + sizeof(BC4Block), (uint8_t *)pPixels + chan1, stride);
}

}  // namespace rgbcx

/*
------------------------------------------------------------------------------
This software is available under 2 licenses -- choose whichever you prefer.
------------------------------------------------------------------------------
ALTERNATIVE A - MIT License
Copyright(c) 2020 Richard Geldreich, Jr.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files(the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions :
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
------------------------------------------------------------------------------
ALTERNATIVE B - Public Domain(www.unlicense.org)
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
software, either in source code form or as a compiled binary, for any purpose,
commercial or non - commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this
software dedicate any and all copyright interest in the software to the public
domain.We make this dedication for the benefit of the public at large and to
the detriment of our heirs and successors.We intend this dedication to be an
overt act of relinquishment in perpetuity of all present and future rights to
this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------------
*/