#include "CompressorETC.h"

#include "nvmath/Vector.inl"
#include "nvmath/Matrix.inl"
#include "nvmath/Color.inl"
#include "nvcore/Utils.h"    // clamp

#define HAVE_RGETC 1
#define HAVE_ETCPACK 0 // Only enable in OSX for debugging.

#if HAVE_RGETC
#include "rg_etc1.h"
#endif

#if HAVE_ETCPACK
// From etcpack.cxx
extern void decompressBlockETC2(unsigned int block_part1, unsigned int block_part2, uint8 *img, int width, int height, int startx, int starty);
extern void decompressBlockAlpha(uint8* data, uint8* img, int width, int height, int ix, int iy);
extern void decompressBlockAlpha16bit(uint8* data, uint8* img, int width, int height, int ix, int iy);
extern int formatSigned;
#endif

#define assert nvCheck

using namespace nv;

// TODO:
// - Accurate rounding of signed 3-bit components.
// - Range based table selection.
// - Slower try all options table selection?
// - Trivial selector assignment.
// * Base point optimization.
// * Brute force base point optimization.
// - Enumerate and evaluate all clusters.
// - Brute force planar mode endpoint refinement. For each color try two rounding directions (8 tests).
// - T & H modes decompression.

// 64-bit ETC1/ETC2 color block, viewed either as raw integers/bytes or through one of the
// per-mode bitfield layouts. All layouts alias the same 8 bytes; the "byte N" comments give the
// byte each group of bitfields is expected to occupy (the compile-time checks only verify the
// total size, so the bit order within a byte depends on the compiler's bitfield allocation).
union BlockETC {
    // Definitions from EtcLib/EtcBlock4x4EncodingBits.h

    // ETC1 "individual" mode: two independent 444 base colors.
    struct Individual {
        uint red2 : 4;      // byte 0
        uint red1 : 4;
        uint green2 : 4;    // byte 1
        uint green1 : 4;
        uint blue2 : 4;     // byte 2
        uint blue1 : 4;
        uint flip : 1;      // byte 3
        uint diff : 1;
        uint cw2 : 3;
        uint cw1 : 3;
        uint selectors;     // bytes 4-7
    };
    NV_COMPILER_CHECK(sizeof(BlockETC::Individual) == 64/8);

    // ETC1 "differential" mode: a 555 base color plus a signed 333 delta for the second color.
    struct Differential {
        uint dred2 : 3;     // byte 0
        uint red1 : 5;
        uint dgreen2 : 3;   // byte 1
        uint green1 : 5;
        uint dblue2 : 3;    // byte 2
        uint blue1 : 5;
        uint flip : 1;      // byte 3
        uint diff : 1;
        uint cw2 : 3;
        uint cw1 : 3;
        uint selectors;     // bytes 4-7
    };
    NV_COMPILER_CHECK(sizeof(Differential) == 64/8);

    // ETC2 T mode. The detect bits alias the spare bits of the differential red channel.
    struct T {
        uint red1b : 2;     // byte 0
        uint detect2 : 1;
        uint red1a : 2;
        uint detect1 : 3;
        uint blue1 : 4;     // byte 1
        uint green1 : 4;
        uint green2 : 4;    // byte 2
        uint red2 : 4;
        uint db : 1;        // byte 3
        uint diff : 1;
        uint da : 2;
        uint blue2 : 4;
        uint selectors;     // bytes 4-7
    };
    NV_COMPILER_CHECK(sizeof(T) == 64/8);

    // ETC2 H mode. Color components are split into a/b/c fragments around the detect bits.
    struct H {
        uint green1a : 3;   // byte 0
        uint red1 : 4;
        uint detect1 : 1;
        uint blue1b : 2;    // byte 1
        uint detect3 : 1;
        uint blue1a : 1;
        uint green1b : 1;
        uint detect2 : 3;
        uint green2a : 3;   // byte 2
        uint red2 : 4;
        uint blue1c : 1;
        uint db : 1;        // byte 3
        uint diff : 1;
        uint da : 1;
        uint blue2 : 4;
        uint green2b : 1;
        uint selectors;     // bytes 4-7
    };
    NV_COMPILER_CHECK(sizeof(H) == 64/8);

    // ETC2 planar mode: three 676 colors (origin, horizontal, vertical) and no selectors.
    struct Planar {
        uint originGreen1 : 1;  // byte 0
        uint originRed : 6;
        uint detect1 : 1;
        uint originBlue1 : 1;   // byte 1
        uint originGreen2 : 6;
        uint detect2 : 1;
        uint originBlue3 : 2;   // byte 2
        uint detect4 : 1;
        uint originBlue2 : 2;
        uint detect3 : 3;
        uint horizRed2 : 1;     // byte 3
        uint diff : 1;
        uint horizRed1 : 5;
        uint originBlue4 : 1;
        uint horizBlue1: 1;     // byte 4
        uint horizGreen : 7;
        uint vertRed1 : 3;      // byte 5
        uint horizBlue2 : 5;
        uint vertGreen1 : 5;    // byte 6
        uint vertRed2 : 3;
        uint vertBlue : 6;      // byte 7
        uint vertGreen2 : 2;
    };
    NV_COMPILER_CHECK(sizeof(Planar) == 64/8);


    // Raw views of the same 8 bytes.
    uint64 data64;
    uint32 data32[2];
    uint8 data8[8];
    // Per-mode bitfield views.
    Individual individual;
    Differential differential;
    T t;
    H h;
    Planar planar;
};
NV_COMPILER_CHECK(sizeof(BlockETC) == 64/8);


// ETC1 intensity modifier tables, indexed as [table index][selector]. Each row is stored in
// ascending order (most negative modifier first); the modifier is added to every channel of the
// sub-block base color (see the get_*_subblock_palette functions).
static const int etc_intensity_modifiers[8][4] = {
    { -8,  -2,   2,   8 },
    { -17,  -5,  5,  17 },
    { -29,  -9,   9,  29 },
    {  -42, -13, 13,  42 },
    { -60, -18, 18,  60 },
    { -80, -24, 24,  80 },
    { -106, -33, 33, 106 },
    { -183, -47, 47, 183 }
};

// Total span of each intensity table: etc_intensity_modifiers[i][3] - etc_intensity_modifiers[i][0].
static const int etc_intensity_range[8] = {
    16, 34, 58, 84, 120, 160, 212, 366
};

// Distance table, one entry per 3-bit table index of the T and H modes.
static const int etc_th_distances[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };

// Mapping between the ascending selector order used by etc_intensity_modifiers and the 2-bit
// selector codes stored in the block. The two tables are inverses of each other.
static const uint8 etc_selector_scramble[] = { 3, 2, 0, 1 };
static const uint8 etc_selector_unscramble[] = { 2, 3, 1, 0 };


// Rounding thresholds for 4-bit quantization, in [0,1]: midpoints4[i] is halfway between the
// 8-bit expansions of the 4-bit values i and i+1. The expansion replicates the nibble
// ((v << 4) | v), matching unpack_color_444. The last entry is a sentinel so that values up to
// 1.0 never round past 15.
static float midpoints4[16];
NV_AT_STARTUP(
    for (int i = 0; i < 15; i++) {
        float f0 = float(((i+0) << 4) | (i+0)) / 255.0f;
        float f1 = float(((i+1) << 4) | (i+1)) / 255.0f;
        midpoints4[i] = (f0 + f1) * 0.5f;
    }
    midpoints4[15] = 1.0f;
);

// Rounding thresholds for 5-bit quantization, in [0,1]: midpoints5[i] is halfway between the
// 8-bit expansions ((v << 3) | (v >> 2)) of the 5-bit values i and i+1, divided by 255.
// The last entry (1.0) is a sentinel for the top code.
static const float midpoints5[32] = {
    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
};

//static const float midpoints6[64];
//static const float midpoints7[128];


// ETC2 Modes:
// - ETC1:
//  - two partitions (flip modes): 2*(4x2, 2x4)
//  - two base colors stored as 444+444 or 555+333
//  - two 3 bit intensity modifiers
// - T Mode. 2 colors 444, 3 bit distance index, 2 bit indices.
// - H Mode. 2 colors 444, 3 bit distance index (LSB implied by the color order), 2 bit indices.
// - Planar mode: 3 colors 676

// Unpacked (logical) description of an ETC1/ETC2 color block. pack_etc2_block and
// unpack_etc2_block convert between this and the BlockETC bit layout.
struct ETC_Data {
    // Which member of the anonymous union below is meaningful.
    enum Mode {
        Mode_ETC1,
        Mode_T,
        Mode_H,
        Mode_Planar,
    } mode;

    // @@ It may make more sense to store bit-expanded or even float colors here.
    union {
        struct {
            uint16 color0;      // 444 or 555
            uint16 color1;      // 444 or 333
            uint8 table0;       // 3 bits
            uint8 table1;       // 3 bits
            bool flip;          // partition mode
            bool diff;          // color encoding
        } etc;
        struct {
            uint16 color0;      // 444
            uint16 color1;      // 444
            uint8 table;        // 3 bits
        } t, h;
        struct {
            uint8 ro, go, bo;   // 676
            uint8 rh, gh, bh;   // 676
            uint8 rv, gv, bv;   // 676
        } planar;
    };

    // Not used by planar mode (the pack/unpack functions skip selectors for Mode_Planar).
    uint8 selector[16];         // 2 bit indices (32 bits)
};

// A candidate encoding together with its error; the error starts at NV_FLOAT_MAX.
struct ETC_Solution {
    float error = NV_FLOAT_MAX;
    ETC_Data data;
};


// Compressor settings passed to compress_etc. The flag names suggest which encoders/modes are
// enabled; their exact effect is defined by code outside this view.
struct ETC_Options {
    //bool fast_flip_mode_selection = false;
    bool use_rg_etc = true;
    bool enable_etc2 = true;
    bool use_planar = true;
    bool use_t_mode = true;
    bool use_h_mode = true;
    bool onebit_alpha = false;
    Vector3 color_weights = Vector3(1);     // per-channel weights, all 1 by default
    
    //int8 eac_search_radius = 1;  // [0-3]
    //bool eac_11bit_mode = false;
};

// Forward declaration; the definition is not part of this section of the file.
// NOTE(review): presumably returns the error of the chosen encoding - confirm at the definition.
/*static*/ float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, BlockETC * output);


// 64-bit EAC block: base value, table index, multiplier and 48 bits of selectors split into six
// byte-sized fields (selectors0 holds the most significant byte, see pack_eac_block).
struct BlockEAC {
    uint base : 8;
    uint table : 4;
    uint multiplier : 4;
    uint selectors0 : 8;
    uint selectors1 : 8;
    
    uint selectors2 : 8;
    uint selectors3 : 8;
    uint selectors4 : 8;
    uint selectors5 : 8;
};
NV_COMPILER_CHECK(sizeof(BlockEAC) == 64/8);

// 128-bit block: an EAC block followed by an ETC color block.
struct BlockETC_EAC {
    BlockEAC eac;
    BlockETC etc;
};
NV_COMPILER_CHECK(sizeof(BlockETC_EAC) == 128/8);

// EAC:
// 8 bit base code word
// 4 bit multiplier
// 4 bit table index
// 16 * 3 bit indices.

// Unpacked (logical) description of an EAC block; see pack_eac_block / unpack_eac_block.
struct EAC_Data {
    uint8 alpha;        // 8 bits
    uint8 multiplier;   // 4 bits
    uint8 table_index;  // 4 bits
    uint8 selector[16]; // 3 bit indices
};

// A candidate EAC encoding together with its error; the error starts at NV_FLOAT_MAX.
struct EAC_Solution {
    float error = NV_FLOAT_MAX;
    EAC_Data data;
};

// EAC compressor settings.
struct EAC_Options {
    int search_radius = 1;      // 0 = fast, 1 = medium, 2 = slow
    bool use_11bit_mode = false;
};


// EAC modifier tables, indexed as [table index][3-bit selector]. In every row selectors 0-3 are
// the negative modifiers and selectors 4-7 the non-negative ones.
static const int eac_intensity_modifiers[16][8] = {
    {-3, -6, -9, -15, 2, 5, 8, 14}, // 0
    {-3, -7,-10, -13, 2, 6, 9, 12}, // 1
    {-2, -5, -8, -13, 1, 4, 7, 12}, // 2
    {-2, -4, -6, -13, 1, 3, 5, 12}, // 3
    {-3, -6, -8, -12, 2, 5, 7, 11}, // 4
    {-3, -7, -9, -11, 2, 6, 8, 10}, // 5
    {-4, -7, -8, -11, 3, 6, 7, 10}, // 6
    {-3, -5, -8, -11, 2, 4, 7, 10}, // 7
    {-2, -6, -8, -10, 1, 5, 7, 9 }, // 8
    {-2, -5, -8, -10, 1, 4, 7, 9 }, // 9
    {-2, -4, -8, -10, 1, 3, 7, 9 }, // 10
    {-2, -5, -7, -10, 1, 4, 6, 9 }, // 11
    {-3, -4, -7, -10, 2, 3, 6, 9 }, // 12
    {-1, -2, -3, -10, 0, 1, 2, 9 }, // 13
    {-4, -6, -8,  -9, 3, 5, 7, 8 }, // 14
    {-3, -5, -7,  -9, 2, 4, 6, 8 }  // 15
};


static void pack_etc2_block(const ETC_Data & data, BlockETC * output_block) {
    BlockETC block;

    bool swap_colors = false;

    if (data.mode == ETC_Data::Mode_ETC1) {
        // These are the same for individual and differential blocks.
        block.individual.diff = data.etc.diff;
        block.individual.flip = data.etc.flip;
        block.individual.cw1 = data.etc.table0;
        block.individual.cw2 = data.etc.table1;

        if (data.etc.diff) {
            block.differential.red1 = data.etc.color0 >> 10;
            block.differential.dred2 = data.etc.color1 >> 6;
            block.differential.green1 = (data.etc.color0 >> 5) & 0x1F;
            block.differential.dgreen2 = (data.etc.color1 >> 3) & 0x7;
            block.differential.blue1 = data.etc.color0 & 0x1F;
            block.differential.dblue2 = data.etc.color1 & 0x7;
        }
        else {
            block.individual.red1 = data.etc.color0 >> 8;
            block.individual.red2 = data.etc.color1 >> 8;
            block.individual.green1 = (data.etc.color0 >> 4) & 0xF;
            block.individual.green2 = (data.etc.color1 >> 4) & 0xF;
            block.individual.blue1 = data.etc.color0 & 0xF;
            block.individual.blue2 = data.etc.color1 & 0xF;
        }
    }
    else if (data.mode == ETC_Data::Mode_T) {
        block.t.red1a = (data.t.color0 >> 8) >> 2;
        block.t.red1b = (data.t.color0 >> 8);
        block.t.green1 = (data.t.color0 >> 4);
        block.t.blue1 = data.t.color0;

        block.t.red2 = (data.t.color1 >> 8);
        block.t.green2 = (data.t.color1 >> 4);
        block.t.blue2 = data.t.color1;

        block.t.da = data.t.table >> 1;
        block.t.db = data.t.table;

        block.t.diff = 1;

        // create an invalid R differential to trigger T mode
        int dr = block.differential.dred2;
        if (dr >= 4) dr -= 8;
        int r = block.differential.red1 + dr;

        block.t.detect1 = 0;
        block.t.detect2 = 1;
        if (r >= 4) {
            block.t.detect1 = 7;
            block.t.detect2 = 0;
        }
    }
    else if (data.mode == ETC_Data::Mode_H) {
        bool table_lsb = data.h.table & 1;
        swap_colors = (data.h.color0 < data.h.color1) ^ !table_lsb;

        uint16 color0 = data.h.color0;
        uint16 color1 = data.h.color1;

        if (swap_colors) {
            swap(color0, color1);
        }

        block.h.red1 = (data.h.color0 >> 8);
        block.h.green1a = (data.h.color0 >> 4) >> 1;
        block.h.green1b = (data.h.color0 >> 4);
        block.h.blue1a = data.h.color0 >> 3;
        block.h.blue1b = data.h.color0 >> 1;
        block.h.blue1c = data.h.color0;

        block.h.red2 = (data.h.color1 >> 8);
        block.h.green2a = (data.h.color1 >> 4) >> 1;
        block.h.green2b = (data.h.color1 >> 4);
        block.h.blue2 = (data.h.color1 >> 8);

        block.h.da = data.h.table >> 2;
        block.h.db = data.h.table >> 1;

        block.h.diff = 1;

        // create an invalid R differential to trigger T mode
        block.h.detect1 = 0;
        block.h.detect2 = 0;
        block.h.detect3 = 0;

        int dr = block.differential.dred2;
        int dg = block.differential.dgreen2;
        if (dr >= 4) dr -= 8;
        if (dg >= 4) dg -= 8;
        int r = block.differential.red1 + dr;
        int g = block.differential.green1 + dg;

        if (r < 0 || r > 31) {
            block.h.detect1 = 1;
        }
        if (g >= 4) {
            block.h.detect2 = 7;
            block.h.detect3 = 0;
        }
        else {
            block.h.detect2 = 0;
            block.h.detect3 = 1;
        }
    }

    if (data.mode == ETC_Data::Mode_Planar) {
        // From ETCLib:
        block.planar.originRed = data.planar.ro;
        block.planar.originGreen1 = data.planar.go >> 6;
        block.planar.originGreen2 = data.planar.go;
        block.planar.originBlue1 = data.planar.bo >> 5;
        block.planar.originBlue2 = data.planar.bo >> 3;
        block.planar.originBlue3 = data.planar.bo >> 1;
        block.planar.originBlue4 = data.planar.bo;

        block.planar.horizRed1 = data.planar.rh >> 1;
        block.planar.horizRed2 = data.planar.rh;
        block.planar.horizGreen = data.planar.gh;
        block.planar.horizBlue1 = data.planar.bh >> 5;
        block.planar.horizBlue2 = data.planar.bh;

        block.planar.vertRed1 = data.planar.rv >> 3;
        block.planar.vertRed2 = data.planar.rv;
        block.planar.vertGreen1 = data.planar.gv >> 2;
        block.planar.vertGreen2 = data.planar.gv;
        block.planar.vertBlue = data.planar.bv;

        block.planar.diff = 1;

        // create valid RG differentials and an invalid B differential to trigger planar mode
        block.planar.detect1 = 0;
        block.planar.detect2 = 0;
        block.planar.detect3 = 0;
        block.planar.detect4 = 0;

        // @@ Clean this up.
        int dr = block.differential.dred2;
        int dg = block.differential.dgreen2;
        int db = block.differential.dblue2;
        if (dr >= 4) dr -= 8;
        if (dg >= 4) dg -= 8;
        if (db >= 4) db -= 8;
        int r = block.differential.red1 + dr;
        int g = block.differential.green1 + dg;
        int b = block.differential.blue1 + db;

        if (r < 0 || r > 31) {
            block.planar.detect1 = 1;
        }
        if (g < 0 || g > 31) {
            block.planar.detect2 = 1;
        }
        if (b >= 4) {
            block.planar.detect3 = 7;
            block.planar.detect4 = 0;
        }
        else {
            block.planar.detect3 = 0;
            block.planar.detect4 = 1;
        }
    }
    else {
        block.individual.selectors = 0;
        for (int i = 0; i < 16; i++) {
            uint selector = data.selector[i];
            selector = etc_selector_scramble[selector];
            block.individual.selectors |= (selector >> 1) << (i ^ 8);
            block.individual.selectors |= (selector & 1) << ((16 + i) ^ 8);
        }

        if (swap_colors) {
            block.individual.selectors ^= 0x0000FFFF;
        }
    }

    // @@ output_block is big endian, byte swap:
    *output_block = block;
}

// Decodes the bitfields of a 64-bit ETC1/ETC2 block into an ETC_Data description.
//
//   input_block - packed block; it is copied as-is, no byte swapping is done (see the @@ note).
//   data        - receives the mode, the mode-specific colors/table indices and, for every mode
//                 except planar, the sixteen 2-bit selectors.
static void unpack_etc2_block(const BlockETC * input_block, ETC_Data * data) {

    // @@ input_block is big endian, byte swap first:
    BlockETC block = *input_block;

    // Assume ETC1 for now.
    data->mode = ETC_Data::Mode_ETC1;

    // These are the same for individual and differential blocks.
    data->etc.diff = block.individual.diff != 0;
    data->etc.flip = block.individual.flip != 0;
    data->etc.table0 = block.individual.cw1;
    data->etc.table1 = block.individual.cw2;

    // Decode colors.
    if (data->etc.diff) {
        data->etc.color0 = U16((block.differential.red1 << 10) | (block.differential.green1 << 5) | block.differential.blue1);
        data->etc.color1 = U16((block.differential.dred2 << 6) | (block.differential.dgreen2 << 3) | block.differential.dblue2);

        // @@ Clean this up.
        // Sign-extend the 3-bit deltas and form base + delta for each channel.
        int dr = block.differential.dred2;
        int dg = block.differential.dgreen2;
        int db = block.differential.dblue2;
        if (dr >= 4) dr -= 8;
        if (dg >= 4) dg -= 8;
        if (db >= 4) db -= 8;
        int r = block.differential.red1 + dr;
        int g = block.differential.green1 + dg;
        int b = block.differential.blue1 + db;

        // Detect ETC2 modes (invalid combinations).
        // The first channel (in R, G, B order) that leaves [0, 31] selects the mode.
        if (r < 0 || r > 31) {
            data->mode = ETC_Data::Mode_T;
        }
        else if (g < 0 || g > 31) {
            data->mode = ETC_Data::Mode_H;
        }
        else if (b < 0 || b > 31) {
            data->mode = ETC_Data::Mode_Planar;
        }
    }
    else {
        data->etc.color0 = U16((block.individual.red1 << 8) | (block.individual.green1 << 4) | block.individual.blue1);
        data->etc.color1 = U16((block.individual.red2 << 8) | (block.individual.green2 << 4) | block.individual.blue2);
    }

    // The writes below go to other members of the ETC_Data union and replace the ETC1 fields stored above.
    if (data->mode == ETC_Data::Mode_T) {
        uint16 r0 = U16((block.t.red1a << 2) | block.t.red1b);
        uint16 g0 = U16(block.t.green1);
        uint16 b0 = U16(block.t.blue1);
        data->t.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;

        uint16 r1 = U16(block.t.red2);
        uint16 g1 = U16(block.t.green2);
        uint16 b1 = U16(block.t.blue2);
        data->t.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;

        data->t.table = U8((block.t.da << 1) | block.t.db);
    }
    else if (data->mode == ETC_Data::Mode_H) {
        uint16 r0 = U16(block.h.red1);
        uint16 g0 = U16((block.h.green1a << 1) | block.h.green1b);
        uint16 b0 = U16((block.h.blue1a << 3) | (block.h.blue1b << 1) | block.h.blue1c);
        data->h.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;

        uint16 r1 = U16(block.h.red2);
        uint16 g1 = U16((block.h.green2a << 1) | block.h.green2b);
        uint16 b1 = U16(block.h.blue2);
        data->h.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;

        data->h.table = U8((block.h.da << 2) | (block.h.db << 1));

        // The lowest bit of the table index is not stored; it is implied by the color order.
        if (data->h.color0 >= data->h.color1) {
            data->h.table++;
        }
    }

    if (data->mode == ETC_Data::Mode_Planar) {
        // Reassemble each 6/7-bit component from its fragments.
        data->planar.ro = U8(block.planar.originRed);
        data->planar.go = U8((block.planar.originGreen1 << 6) + block.planar.originGreen2);
        data->planar.bo = U8((block.planar.originBlue1 << 5) + (block.planar.originBlue2 << 3) + (block.planar.originBlue3 << 1) + block.planar.originBlue4);

        data->planar.rh = U8((block.planar.horizRed1 << 1) + block.planar.horizRed2);
        data->planar.gh = U8(block.planar.horizGreen);
        data->planar.bh = U8((block.planar.horizBlue1 << 5) + block.planar.horizBlue2);

        data->planar.rv = U8((block.planar.vertRed1 << 3) + block.planar.vertRed2);
        data->planar.gv = U8((block.planar.vertGreen1 << 2) + block.planar.vertGreen2);
        data->planar.bv = U8(block.planar.vertBlue);
    }
    else {
        // Note, selectors are arranged in columns, keep that order.
        // The selector word is read byte by byte: bytes 0-1 hold the high bits, bytes 2-3 the low
        // bits, and selectors 0-7 live in the second byte of each pair.
        unsigned char * selectors = (uint8 *)&block.individual.selectors;
        for (int i = 0; i < 16; i++) {
            int byte_msb = (1 - (i / 8));
            int byte_lsb = (3 - (i / 8));
            int shift = (i & 7);

            uint msb = (selectors[byte_msb] >> shift) & 1;
            uint lsb = (selectors[byte_lsb] >> shift) & 1;

            uint index = (msb << 1) | lsb;

            if (data->mode == ETC_Data::Mode_ETC1) {
                data->selector[i] = etc_selector_unscramble[index];
            }
            else {
                // No scrambling in T & H modes.
                data->selector[i] = index;
            }
        }
    }
}

static void pack_eac_block(const EAC_Data & data, BlockEAC * output_block) {

    output_block->base = data.alpha;
    output_block->table = data.table_index;
    output_block->multiplier = data.multiplier;
    
    uint64 selector_bits = 0;
    for (uint i = 0; i < 16; i++) {
        uint shift = 45 - (3 * i);
        selector_bits |= uint64(data.selector[i]) << shift;
    }
    
    output_block->selectors0 = selector_bits >> 40;
    output_block->selectors1 = selector_bits >> 32;
    output_block->selectors2 = selector_bits >> 24;
    output_block->selectors3 = selector_bits >> 16;
    output_block->selectors4 = selector_bits >> 8;
    output_block->selectors5 = selector_bits >> 0;
}

static void unpack_eac_block(const BlockEAC * input_block, EAC_Data * data) {
    
    data->alpha = input_block->base;
    data->table_index = input_block->table;
    data->multiplier = input_block->multiplier;
    
    uint64 selector_bits = 0;
    selector_bits |= uint64(input_block->selectors0) << 40;
    selector_bits |= uint64(input_block->selectors1) << 32;
    selector_bits |= uint64(input_block->selectors2) << 24;
    selector_bits |= uint64(input_block->selectors3) << 16;
    selector_bits |= uint64(input_block->selectors4) << 8;
    selector_bits |= uint64(input_block->selectors5) << 0;
    for (uint i = 0; i < 16; i++) {
        uint shift = 45 - (3 * i);
        data->selector[i] = (selector_bits >> shift) & 0x7;
    }
}


// Widens an 'nin'-bit value to 'nout' bits: the value is shifted up and its own top bits are
// copied into the freed low bits. Requires nout - nin <= nin, otherwise the right shift count
// below would be negative.
inline int bitexpand(uint32 bits, uint nin, uint nout) {
    assert(nout > nin);
    //assert(nout - nin > nin);
    const uint grow = uint(nout - nin);         // number of new low bits
    const uint drop = uint(2U * nin - nout);    // == nin - grow
    return (bits << grow) | (bits >> drop);
}

// Integer unpacking of a 444 color for the decompressor: each nibble is expanded to 8 bits by
// replicating it (v * 0x11 == (v << 4) | v for a 4-bit v).
static void unpack_color_444(uint32 packed_color, int * r, int * g, int * b) {
    const int red4 = (packed_color >> 8) & 0xF;
    const int green4 = (packed_color >> 4) & 0xF;
    const int blue4 = packed_color & 0xF;
    *r = red4 * 0x11;
    *g = green4 * 0x11;
    *b = blue4 * 0x11;
}

// Float variant: unpacks a 444 color and scales the 8-bit components by 1/255.
static Vector3 unpack_color_444(uint32 packed_color) {
    int red, green, blue;
    unpack_color_444(packed_color, &red, &green, &blue);
    const Vector3 color8(float(red), float(green), float(blue));
    return color8 * 1.0f / 255.0f;
}

// Integer unpacking of a 555 color: each 5-bit component is expanded to 8 bits by appending its
// three top bits ((v << 3) | (v >> 2)).
static void unpack_color_555(uint32 packed_color, int * r, int * g, int * b) {
    const int red5 = (packed_color >> 10) & 0x1F;
    const int green5 = (packed_color >> 5) & 0x1F;
    const int blue5 = packed_color & 0x1F;

    *r = (red5 << 3) | (red5 >> 2);
    *g = (green5 << 3) | (green5 >> 2);
    *b = (blue5 << 3) | (blue5 >> 2);
}

// Float variant: unpacks a 555 color and scales the 8-bit components by 1/255.
static Vector3 unpack_color_555(uint32 packed_color) {
    int red, green, blue;
    unpack_color_555(packed_color, &red, &green, &blue);
    const Vector3 color8(float(red), float(green), float(blue));
    return color8 * 1.0f / 255.0f;
}

// Splits a packed 333 delta into its three components and sign-extends each from 3 bits
// (raw values 4..7 become -4..-1). No bit expansion is applied.
static void unpack_delta_333(uint32 packed_delta, int * r, int * g, int * b) {
    int * const outputs[3] = { r, g, b };
    const uint shifts[3] = { 6, 3, 0 };

    for (int c = 0; c < 3; c++) {
        int value = (packed_delta >> shifts[c]) & 7;
        if (value >= 4) {
            value -= 8;
        }
        *outputs[c] = value;
    }
}

// Unpacks the second color of a differential block: 555 base plus signed 333 delta, expanded to
// 8 bits per channel. Returns false when any component of base + delta leaves [0, 31]; in that
// case the components are clamped before expansion.
static bool unpack_color_555(uint32 packed_color, uint32 packed_delta, int * r, int * g, int * b) {
    int delta_r, delta_g, delta_b;
    unpack_delta_333(packed_delta, &delta_r, &delta_g, &delta_b);

    int red5 = int((packed_color >> 10U) & 0x1F) + delta_r;
    int green5 = int((packed_color >> 5U) & 0x1F) + delta_g;
    int blue5 = int(packed_color & 0x1F) + delta_b;

    // A single unsigned comparison on the OR of the components catches both a negative value
    // (sign bit set) and a value above 31 (some bit above bit 4 set).
    const bool in_range = !(static_cast<uint>(red5 | green5 | blue5) > 31U);
    if (!in_range) {
        red5 = clamp(red5, 0, 31);
        green5 = clamp(green5, 0, 31);
        blue5 = clamp(blue5, 0, 31);
    }

    *r = (red5 << 3) | (red5 >> 2);
    *g = (green5 << 3) | (green5 >> 2);
    *b = (blue5 << 3) | (blue5 >> 2);

    return in_range;
}

// Float variant of the base + delta unpacking; asserts that the combination is in range and
// scales the 8-bit components by 1/255.
static Vector3 unpack_color_555(uint32 packed_color, uint32 packed_delta) {
    int red, green, blue;
    bool success = unpack_color_555(packed_color, packed_delta, &red, &green, &blue);
    assert(success);
    const Vector3 color8(float(red), float(green), float(blue));
    return color8 * 1.0f / 255.0f;
}


// Unpacks a 676 color (6-bit red at bit 13, 7-bit green at bit 6, 6-bit blue at bit 0) and
// expands every component to 8 bits with bitexpand.
static void unpack_color_676(uint32 packed_color, int * r, int * g, int * b) {
    const int red6 = (packed_color >> 13) & 0x3F;
    const int green7 = (packed_color >> 6) & 0x7F;
    const int blue6 = packed_color & 0x3F;

    *r = bitexpand(red6, 6, 8);
    *g = bitexpand(green7, 7, 8);
    *b = bitexpand(blue6, 6, 8);
}


// Quantizes a float color (components nominally in [0,1]) to a packed 444 color, rounding each
// component to the nearest 4-bit code according to the 444 bit expansion.
// Out-of-range components saturate to 0 or 15.
static uint32 pack_color_444(Vector3 color) {

    // Truncate.
    uint r = U32(ftoi_trunc(clamp(color.x * 15.0f, 0.0f, 15.0f)));
    uint g = U32(ftoi_trunc(clamp(color.y * 15.0f, 0.0f, 15.0f)));
    uint b = U32(ftoi_trunc(clamp(color.z * 15.0f, 0.0f, 15.0f)));

    // Round exactly according to 444 bit-expansion. Never round past the top code: a component
    // above 1.0 would otherwise become 16 and spill into the neighbouring channel's bits.
    if (r < 15U) r += (color.x > midpoints4[r]);
    if (g < 15U) g += (color.y > midpoints4[g]);
    if (b < 15U) b += (color.z > midpoints4[b]);

    return (r << 8) | (g << 4) | b;
}

// Quantizes a float color (components nominally in [0,1]) to a packed 555 color, rounding each
// component to the nearest 5-bit code according to the 555 bit expansion.
// Out-of-range components saturate to 0 or 31.
static uint32 pack_color_555(Vector3 color) {

    // Truncate.
    uint r = U32(ftoi_trunc(clamp(color.x * 31.0f, 0.0f, 31.0f)));
    uint g = U32(ftoi_trunc(clamp(color.y * 31.0f, 0.0f, 31.0f)));
    uint b = U32(ftoi_trunc(clamp(color.z * 31.0f, 0.0f, 31.0f)));

    // Round exactly according to 555 bit-expansion. Never round past the top code: a component
    // above 1.0 would otherwise become 32 and spill into the neighbouring channel's bits.
    if (r < 31U) r += (color.x > midpoints5[r]);
    if (g < 31U) g += (color.y > midpoints5[g]);
    if (b < 31U) b += (color.z > midpoints5[b]);

    return (r << 10) | (g << 5) | b;
}

// Quantizes a float color delta to a packed signed 333 delta. Each component is scaled by 31,
// clamped to [-4, 3], rounded, and stored as a 3-bit two's complement value (red in the top bits).
static uint32 pack_delta_333(Vector3 delta) {
    // @@ Accurate rounding of signed 3-bit components.

    int red = ftoi_round(clamp(delta.x * 31.0f, -4.0f, 3.0f));
    int green = ftoi_round(clamp(delta.y * 31.0f, -4.0f, 3.0f));
    int blue = ftoi_round(clamp(delta.z * 31.0f, -4.0f, 3.0f));

    //r += (delta.x > delta_midpoints3[r]);
    //g += (delta.y > delta_midpoints3[g]);
    //b += (delta.z > delta_midpoints3[b]);

    // Map negative values onto the upper half of the 3-bit range.
    red = (red < 0) ? red + 8 : red;
    green = (green < 0) ? green + 8 : green;
    blue = (blue < 0) ? blue + 8 : blue;

    return static_cast<uint16>(blue | (green << 3) | (red << 6));
}

// Quantizes a float in [0,1] to the nearest 6-bit code according to the 6 -> 8 bit expansion.
// Out-of-range input saturates to 0 or 63.
static uint8 pack_float_6(float f) {

    // Truncate.
    uint u = U32(ftoi_trunc(clamp(f * 63.0f, 0.0f, 63.0f)));

    // Round exactly according to 6 bit-expansion.
    //u += (f > midpoints6[u]);

    // The top code has no upper neighbour; skipping it also prevents f > 1 from producing 64.
    if (u < 63U) {
        // bitexpand yields 8-bit values, so the midpoint must be brought back to [0,1] before
        // it is compared with f.
        float midpoint = 0.5f * float(bitexpand(u, 6, 8) + bitexpand(u + 1, 6, 8)) / 255.0f;    // @@ Precompute.
        u += (f > midpoint);
    }

    return U8(u);
}

// Quantizes a float in [0,1] to the nearest 7-bit code according to the 7 -> 8 bit expansion.
// Out-of-range input saturates to 0 or 127.
static uint8 pack_float_7(float f) {

    // Truncate.
    uint u = U32(ftoi_trunc(clamp(f * 127.0f, 0.0f, 127.0f)));

    // Round exactly according to 7 bit-expansion.
    //u += (f > midpoints7[u]);

    // The top code has no upper neighbour; skipping it also prevents f > 1 from producing 128.
    if (u < 127U) {
        // bitexpand yields 8-bit values, so the midpoint must be brought back to [0,1] before
        // it is compared with f.
        float midpoint = 0.5f * float(bitexpand(u, 7, 8) + bitexpand(u + 1, 7, 8)) / 255.0f;   // @@ Precompute.
        u += (f > midpoint);
    }

    return U8(u);
}

// Quantizes a float to 6 bits with an explicit rounding direction: round_dir = false truncates,
// round_dir = true adds one before truncating. The result is clamped to [0, 63].
static uint8 pack_float_6(float f, bool round_dir) {
    const float scaled = f * 63.0f + round_dir;
    const float limited = clamp(scaled, 0.0f, 63.0f);
    return U8(U32(ftoi_trunc(limited)));
}

// Quantizes a float to 7 bits with an explicit rounding direction: round_dir = false truncates,
// round_dir = true adds one before truncating. The result is clamped to [0, 127].
static uint8 pack_float_7(float f, bool round_dir) {
    const float scaled = f * 127.0f + round_dir;
    const float limited = clamp(scaled, 0.0f, 127.0f);
    return U8(U32(ftoi_trunc(limited)));
}


// Weighted average color of one of the two 8-texel partitions of a 4x4 block.
//
//   input_colors  - 16 block colors; only xyz is used. Texels are addressed as 4*row + column,
//                   which is what the contiguous halves used for flip = true imply.
//   input_weights - per-texel weights.
//   flip          - true: partitions are rows 0-1 and rows 2-3; false: columns 0-1 and columns 2-3.
//   partition     - 0 for the first partition, non-zero for the second.
//
// Returns the weighted average, or the plain (zero) sum when all weights in the partition are zero.
Vector3 get_partition_color_average(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {
    Vector3 sum_c(0);
    float sum_w = 0;

    if (flip) {
        // Horizontal partition: eight consecutive texels.
        const int first = partition ? 8 : 0;

        for (int i = first; i < first + 8; i++) {
            sum_c += input_colors[i].xyz() * input_weights[i];
            sum_w += input_weights[i];
        }
    }
    else {
        // Vertical partition: two adjacent columns of every row.
        const int first_column = partition ? 2 : 0;

        for (int row = 0; row < 4; row++) {
            for (int column = first_column; column < first_column + 2; column++) {
                const int i = 4 * row + column;
                sum_c += input_colors[i].xyz() * input_weights[i];
                sum_w += input_weights[i];
            }
        }
    }
    if (sum_w == 0) {
        // Avoid dividing by zero when the whole partition has zero weight.
        sum_w = 1;
    }

    return sum_c * 1.0f / sum_w;
}

// Approximates the base color of a partition with the unweighted mean of its eight colors.
Vector3 base_color_average(const Vector3 colors[8]) {
    Vector3 total(0);

    for (const Vector3 * c = colors; c != colors + 8; c++) {
        total += *c;
    }

    return total * 1.0f / 8.0f;
}
// Approximates the base color of a partition with the weighted mean of its eight colors.
// When all weights are zero the (zero) sum is returned instead of dividing by zero, matching
// get_partition_color_average.
Vector3 base_color_average(const Vector3 colors[8], const float weights[8]) {
    Vector3 sum_c(0);
    float sum_w = 0;

    for (uint i = 0; i < 8; i++) {
        sum_c += colors[i] * weights[i];
        sum_w += weights[i];
    }

    if (sum_w == 0) {
        sum_w = 1;
    }

    return sum_c * 1.0f / sum_w;
}

#if 0
// Compute base color using least squares.
// (Currently compiled out by the surrounding #if 0.)
// Returns sum(colors[i] * I_i) / sum(I_i * I_i), where I_i is the intensity modifier selected by
// indices[i] in table 'table_index'.
Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int indices[8]) {

    // Compute dot(C, I) and dot(I, I)
    Vector3 CI(0);
    float II = 0;

    for (int i = 0; i < 8; i++) {
        Vector3 C = colors[i];
        float I = etc_intensity_modifiers[table_index][indices[i]];
        CI += C * I;
        II += I * I;
    }

    return CI / II;
}

// @@ Do weighted least squares!
// (Currently compiled out by the surrounding #if 0.)
// Same as the unweighted version, except that each color term is scaled by its weight.
// NOTE(review): the denominator II is accumulated without the weight, unlike the numerator.
Vector3 base_color_least_squares(const Vector3 colors[8], const float weights[8], int table_index, int indices[8]) {

    // Compute dot(C, I) and dot(I, I)
    Vector3 CI(0);
    float II = 0;

    for (int i = 0; i < 8; i++) {
        Vector3 C = colors[i];
        float w = weights[i];
        float I = etc_intensity_modifiers[table_index][indices[i]];
        CI += C * I * w;
        II += I * I;
    }

    return CI / II;
}

// Is this any faster than the above?
// (Currently compiled out by the surrounding #if 0.)
// Variant that takes cluster sizes instead of per-color indices: the first c0 colors use
// modifier 0, the next c1 modifier 1, the next c2 modifier 2 and the remaining ones modifier 3.
Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int c0, int c1, int c2) {

    // Compute dot(C, I) and dot(I, I)
    Vector3 CI(0);

    float I0 = etc_intensity_modifiers[table_index][0];
    float I1 = etc_intensity_modifiers[table_index][1];
    float I2 = etc_intensity_modifiers[table_index][2];
    float I3 = etc_intensity_modifiers[table_index][3];

    // dot(I, I) follows directly from the cluster sizes.
    float II = 0;
    II += c0 * I0 * I0;
    II += c1 * I1 * I1;
    II += c2 * I2 * I2;
    II += (8-c0-c1-c2) * I3 * I3;

    // The running index i carries over from one cluster loop to the next.
    int i = 0;
    for (; i < c0; i++)         CI += colors[i] * I0;
    for (; i < c0+c1; i++)      CI += colors[i] * I1;
    for (; i < c0+c1+c2; i++)   CI += colors[i] * I2;
    for (; i < 8; i++)          CI += colors[i] * I3;

    return CI / II;
}

// Expands cluster sizes into per-element selectors: the first c0 entries get selector 0, the
// next c1 selector 1, the next c2 selector 2 and the rest (up to 8 entries) selector 3.
// (Currently compiled out by the surrounding #if 0.)
static void selectors_for_clusters(int c0, int c1, int c2, int selector[8]) {
    int i = 0;
    for (; i < c0; i++)         selector[i] = 0;
    for (; i < c0+c1; i++)      selector[i] = 1;
    for (; i < c0+c1+c2; i++)   selector[i] = 2;
    for (; i < 8; i++)          selector[i] = 3;
}

// Counts the (c0, c1, c2) triples with c0 + c1 + c2 <= count by brute-force enumeration.
// (Currently compiled out by the surrounding #if 0.)
static int cluster_count(int count = 8) {
    int total = 0;

    for (uint c0 = 0; c0 <= count; c0++) {
        for (uint c1 = 0; c1 <= count-c0; c1++) {
            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
                total++;
            }
        }
    }

    // total is the number of possible cluster combinations.
    return total;
}

// Does each partition have its own table index? Or is it shared for both?


// Unimplemented sketch: the body only contains notes and a commented-out enumeration loop.
// (Currently compiled out by the surrounding #if 0.)
void test_all_total_orders(const Vector4 colors[8], const float weights[8], int table_index) {

    // @@ compute average luminance of each partition.


    // @@ sort colors by the luminance differences respect to partition average.

    // @@ compute luminance range, pick table index based on that. Try nearest indices also?

    // For each cluster combination:
/*
    for (uint c0 = 0; c0 <= count; c0++) {
        for (uint c1 = 0; c1 <= count-c0; c1++) {
            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {

                // compute selectors.
                int selector[8];
                selectors_for_clusters(c0, c1, c2, selector);

                // compute base colors that minimize error in each partition.

                // determine error for these quantized base colors. Record best cluster combination.

            }
        }
    }
*/
}

// Unimplemented stub for the whole-block variant; the body is empty apart from notes.
// (Currently compiled out by the surrounding #if 0.)
void test_all_total_orders(const Vector4 input_colors[16], const float input_weights[16], uint count, bool flip, int table_index) {

    // Slow method is to test both flip modes.
    //test_all_total_orders(input_colors, input_weights, /*flip=*/false, int table_index);
    //test_all_total_orders(input_colors, input_weights, /*flip=*/true, int table_index);
}


// @@ How do compute the error for a given base color?

// Compute indices using range fitting / quantization of input colors?

// Compute indices using range fitting.

// Unfinished skeleton of the cluster enumeration: the loops visit every (c0, c1, c2) split but
// the innermost body is empty and the accumulators are never used.
// (Currently compiled out by the surrounding #if 0.)
void test_all_clusters() {
    int count = 8; // Could be smaller.

    for (uint c0 = 0; c0 <= count; c0++) {
        Vector3 x1(0.0f);
        float w1 = 0.0f;

        for (uint c1 = 0; c1 <= count-c0; c1++) {
            Vector3 x2(0.0f);
            float w2 = 0.0f;

            for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
            }
        }
    }

}

#endif


// Builds an opaque Color32 from integer channel values, clamping each channel
// to the [0, 255] range.
static Color32 saturate_color(int R, int G, int B) {
    Color32 result;
    result.a = 255;
    result.b = U8(clamp(B, 0, 255));
    result.g = U8(clamp(G, 0, 255));
    result.r = U8(clamp(R, 0, 255));
    return result;
}

// Computes the 4-entry palette of the first sub-block in differential mode:
// the 555 base color offset by each modifier of the selected intensity table,
// saturated to 8 bits per channel.
static void get_diff_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);

    int base_r, base_g, base_b;
    unpack_color_555(packed_color, &base_r, &base_g, &base_b);

    const int * modifiers = etc_intensity_modifiers[table_idx];

    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }
}

// Computes the 4-entry palette of the second sub-block in differential mode.
// The base color is reconstructed from the 555 base plus the packed delta; the
// value returned by that unpack step is forwarded to the caller. The palette is
// filled in regardless of that result.
static bool get_diff_subblock_palette(uint16 packed_color, uint16 packed_delta, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);

    int base_r, base_g, base_b;
    const bool unpack_ok = unpack_color_555(packed_color, packed_delta, &base_r, &base_g, &base_b);

    const int * modifiers = etc_intensity_modifiers[table_idx];

    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }

    return unpack_ok;
}

// Computes the 4-entry palette of a sub-block in individual (444) mode: the
// unpacked base color offset by each modifier of the selected intensity table,
// saturated to 8 bits per channel.
static void get_abs_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);

    int base_r, base_g, base_b;
    unpack_color_444(packed_color, &base_r, &base_g, &base_b);

    const int * modifiers = etc_intensity_modifiers[table_idx];

    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }
}

// Returns the selector stored for pixel (x, y). Selectors are laid out in
// column order, so the column index is the major one.
static int get_selector(const ETC_Data & data, int x, int y) {
    const int column_major_index = 4 * x + y;
    return data.selector[column_major_index];
}
// Returns the sub-block (0 or 1) that pixel (x, y) belongs to. With the flip
// bit set the block is split by row (y > 1), otherwise by column (x > 1).
static int get_partition(const ETC_Data & data, int x, int y) {
    assert(data.mode == ETC_Data::Mode_ETC1);
    const int split_coordinate = data.etc.flip ? y : x;
    return split_coordinate > 1;
}

static void decode_etc1(const ETC_Data & data, Vector4 colors[16]) {
    assert(data.mode == ETC_Data::Mode_ETC1);

    Color32 palette[2][4];

    if (data.etc.diff) {
        // Decode colors in 555+333 mode.
        get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
        get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
    }
    else {
        // Decode colors in 444,444 mode.
        get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
        get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
    }

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            colors[y*4+x] = toVector4(palette[get_partition(data, x, y)][get_selector(data, x, y)]);
        }
    }
}

static void decode_etc2_t(const ETC_Data & data, Vector4 output_colors[16]) {
    assert(data.mode == ETC_Data::Mode_T);

    int r, g, b;
    Color32 palette[4];

    int d = etc_th_distances[data.t.table];

    unpack_color_444(data.t.color0, &r, &g, &b);
    palette[0] = saturate_color(r, g, b);

    unpack_color_444(data.t.color1, &r, &g, &b);
    palette[1] = saturate_color(r + d, g + d, b + d);
    palette[2] = saturate_color(r, g, b);
    palette[3] = saturate_color(r - d, g - d, b - d);

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            output_colors[y*4+x] = toVector4(palette[get_selector(data, x, y)]);
        }
    }
}

static void decode_etc2_h(const ETC_Data & data, Vector4 output_colors[16]) {
    assert(data.mode == ETC_Data::Mode_H);

    int r, g, b;
    Color32 palette[4];

    int d = etc_th_distances[data.t.table];

    unpack_color_444(data.t.color0, &r, &g, &b);
    palette[0] = saturate_color(r + d, g + d, b + d);
    palette[1] = saturate_color(r - d, g - d, b - d);

    unpack_color_444(data.t.color1, &r, &g, &b);
    palette[2] = saturate_color(r + d, g + d, b + d);
    palette[3] = saturate_color(r - d, g - d, b - d);

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            output_colors[y*4+x] = toVector4(palette[get_selector(data, x, y)]);
        }
    }
}

static void decode_etc2_planar(const ETC_Data & data, Vector4 output_colors[16]) {
    assert(data.mode == ETC_Data::Mode_Planar);

    int ro, go, bo; // origin color
    int rh, gh, bh; // horizontal color
    int rv, gv, bv; // vertical color

    // Unpack from 676
    ro = bitexpand(data.planar.ro, 6, 8);   // r << 2 | r >> 4
    go = bitexpand(data.planar.go, 7, 8);   // g << 1 | g >> 6
    bo = bitexpand(data.planar.bo, 6, 8);

    rh = bitexpand(data.planar.rh, 6, 8);
    gh = bitexpand(data.planar.gh, 7, 8);
    bh = bitexpand(data.planar.bh, 6, 8);

    rv = bitexpand(data.planar.rv, 6, 8);
    gv = bitexpand(data.planar.gv, 7, 8);
    bv = bitexpand(data.planar.bv, 6, 8);


    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            int r = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) >> 2;
            int g = (4 * go + x * (gh - go) + y * (gv - go) + 2) >> 2;
            int b = (4 * bo + x * (bh - bo) + y * (bv - bo) + 2) >> 2;

            int idx = 4 * y + x;
            output_colors[idx].x = saturate(float(r) / 255.0f);
            output_colors[idx].y = saturate(float(g) / 255.0f);
            output_colors[idx].z = saturate(float(b) / 255.0f);
            output_colors[idx].w = 1;
        }
    }
}

// Dispatches to the decoder that matches the block's mode. Any mode other than
// ETC1, T or H is decoded as planar.
static void decode_etc2(const ETC_Data & data, Vector4 colors[16]) {
    switch (data.mode) {
        case ETC_Data::Mode_ETC1:
            decode_etc1(data, colors);
            break;
        case ETC_Data::Mode_T:
            decode_etc2_t(data, colors);
            break;
        case ETC_Data::Mode_H:
            decode_etc2_h(data, colors);
            break;
        default: // ETC_Data::Mode_Planar
            decode_etc2_planar(data, colors);
            break;
    }
}


static float get_alpha11(int base, int table, int mul, int index) {
	int elevenbase = base*8+4;
    int tabVal = eac_intensity_modifiers[table][index];
	int elevenTabVal = tabVal*8;
 
	if(mul!=0) elevenTabVal*=mul;
	else elevenTabVal/=8;
  
	//calculate sum
	int elevenbits = elevenbase+elevenTabVal;
 
	//clamp..
	if(elevenbits>=256*8) elevenbits=256*8-1;
	else if(elevenbits<0) elevenbits=0;
	//elevenbits now contains the 11 bit alpha value as defined in the spec.
 
	//extend to 16 bits before returning, since we don't have any good 11-bit file formats.
	uint16 sixteenbits = (elevenbits<<5)+(elevenbits>>6);
 
	return float(sixteenbits) / 65535.0f;
}

// Reconstructs an 8-bit EAC value (base + modifier * multiplier, clamped to
// [0, 255]) and returns it normalized to [0, 1].
static float get_alpha8(int base, int table, int mul, int index) {
    const int unclamped = base + eac_intensity_modifiers[table][index] * mul;
    const int alpha8 = clamp(unclamped, 0, 255);
    return alpha8 / 255.0f;
}


static void decode_eac_8(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 3) {
    for (int i = 0; i < 16; i++) {
        int s = data.selector[4*(i%4) + i/4];
        output_colors[i].component[output_channel] = get_alpha8(data.alpha, data.table_index, data.multiplier, s);
    }
}

static void decode_eac_11(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 0) {
    for (int i = 0; i < 16; i++) {
        int s = data.selector[4*(i%4) + i/4];
        output_colors[i].component[output_channel] = get_alpha11(data.alpha, data.table_index, data.multiplier, s);
    }
}


static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
    Vector3 d = (p - c) * w;
    return dot(d, d);
}

static float evaluate_rgb_mse(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, const ETC_Data & data) {
    // Decode data and compare?
    Vector4 colors[16];
    decode_etc2(data, colors);

    float error = 0;
    for (int i = 0; i < 16; i++) {
        error += input_weights[i] * evaluate_mse(input_colors[i].xyz(), colors[i].xyz(), options.color_weights);
    }
    return error;
}


// Picks the intensity table whose range is closest to the largest luminance
// deviation between the base color and the pixels of the given partition.
// Luminance is the plain average of the three channels. input_weights is not
// used.
static int select_table_index(const Vector3 & base_color, const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {

    const int first = partition ? 2 : 0;
    const int last = partition ? 4 : 2;

    // Largest absolute luminance deviation inside the partition.
    float max_lum_delta = -NV_FLOAT_MAX;

    for (int minor = 0; minor < 4; minor++) {
        for (int major = first; major < last; major++) {
            // With flip set the partition spans rows instead of columns.
            const int idx = flip ? major*4 + minor : minor*4 + major;
            const float lum_delta = dot(base_color, Vector3(1.0f/3)) - dot(input_colors[idx].xyz(), Vector3(1.0f/3));
            max_lum_delta = max(max_lum_delta, fabsf(lum_delta));
        }
    }

    // Table whose range best matches that deviation (in 8-bit units).
    float smallest_mismatch = NV_FLOAT_MAX;
    int chosen_table = -1;
    for (int t = 0; t < 8; t++) {
        const float mismatch = fabsf(etc_intensity_range[t] - 255 * max_lum_delta);
        if (mismatch < smallest_mismatch) {
            smallest_mismatch = mismatch;
            chosen_table = t;
        }
    }

    return chosen_table;
}

static float update_selectors(const Vector4 input_colors[16], const float input_weights[16], ETC_Data & data, const ETC_Options & options) {

    Color32 palette[2][4];

    if (data.etc.diff) {
        // Decode colors in 555+333 mode.
        get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
        get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
    }
    else {
        // Decode colors in 444,444 mode.
        get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
        get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
    }

    float total_error = 0;

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            int i = y*4 + x;

            float best_error = NV_FLOAT_MAX;
            int best_p = 0;

            for (int p = 0; p < 4; p++) {
                float error = evaluate_mse(toVector3(palette[get_partition(data, x, y)][p]), input_colors[i].xyz(), options.color_weights);
                if (error < best_error) {
                    best_error = error;
                    best_p = p;
                }
            }

            int s = x*4 + y;
            data.selector[s] = U8(best_p);

            total_error += best_error * input_weights[i];
        }
    }

    return total_error;
}


static void partition_input_block(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition, Vector3 output_colors[8], float output_weights[8]) {

    const int xb = partition ? 2 : 0;
    const int xe = partition ? 4 : 2;

    for (int y = 0, i = 0; y < 4; y++) {
        for (int x = xb; x < xe; x++, i++) {
            int idx = flip ? x*4 + y : y*4 + x;

            output_colors[i] = input_colors[idx].xyz();
            output_weights[i] = input_weights[idx];
        }
    }
}


// Working state for one 8-pixel sub-block while its encoding is being searched.
struct ETC_SubBlock {
    Vector3 color;      // Base color; the table modifiers (scaled by 1/255) are added to it.
    bool delta;         // NOTE(review): not read in the visible code; presumably marks differential encoding - confirm.
    int table;          // Row of etc_intensity_modifiers used by this sub-block.
    int indices[8];     // Per-pixel selector into that table row.
};

static float evaluate_rgb_mse(const Vector3 colors[8], const float weights[8], const ETC_Options & options, ETC_SubBlock * sub_block) {

    // Evaluate sub block palette.
    Vector3 palette[4];
    palette[0] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][0] / 255.0f);
    palette[1] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][1] / 255.0f);
    palette[2] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][2] / 255.0f);
    palette[3] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][3] / 255.0f);

    float mse = 0;
    for (int i = 0; i < 8; i++) {
        mse += evaluate_mse(colors[i], palette[sub_block->indices[i]], options.color_weights) * weights[i];
    }

    return mse;
}

// For the selector assignment currently stored in sub_block->indices, computes
// the unquantized base color that minimizes the weighted squared error and
// stores it in sub_block->color.
//
// Each pixel is reconstructed as (base + D_i), where D_i is the table modifier
// picked by its selector, so per channel the minimizer of
//   sum_i w_i * (base + D_i - c_i)^2
// is  base = (sum_i w_i * c_i - sum_i w_i * D_i) / sum_i w_i.
// The RGB channels are independent and share the same D term.
//
// If the total weight is not positive the base color is left untouched.
static void optimize_base_color(const Vector3 colors[8], const float weights[8], ETC_SubBlock * sub_block) {

    float D_sum = 0;
    float R_sum = 0;
    float G_sum = 0;
    float B_sum = 0;
    float W_sum = 0;

    for (int i = 0; i < 8; i++) {
        const float Di = etc_intensity_modifiers[sub_block->table][sub_block->indices[i]] / 255.0f;  // @@ precompute?

        D_sum += Di * weights[i];
        R_sum += colors[i].x * weights[i];
        G_sum += colors[i].y * weights[i];
        B_sum += colors[i].z * weights[i];
        W_sum += weights[i];
    }

    // Avoid dividing by zero when every pixel has zero weight.
    if (W_sum <= 0.0f) {
        return;
    }

    // Each channel uses its own weighted sum (green and blue previously reused the red sum).
    sub_block->color.x = (R_sum - D_sum) / W_sum;
    sub_block->color.y = (G_sum - D_sum) / W_sum;
    sub_block->color.z = (B_sum - D_sum) / W_sum;

    // @@ Estimate error (without quantization)

    // @@ Repeat for all tables?

    // @@ Given a new center, compute new indices, then update center?
}


// Compacts the color list in place: entries with zero weight are dropped and
// exact duplicates are merged into their first occurrence, accumulating the
// weights. Returns the number of entries kept at the front of the arrays.
static int reduce_colors(Vector3 * colors, float * weights, int count) {

    int kept = 0;

    for (int cur = 0; cur < count; cur++) {

        // Zero-weight entries (including already merged duplicates) are skipped.
        if (weights[cur] != 0.0f) {

            colors[kept] = colors[cur];
            weights[kept] = weights[cur];

            // Fold every later duplicate of this color into the kept entry.
            for (int later = cur + 1; later < count; later++) {
                if (colors[cur] == colors[later]) {       // @@ Compare within threshold?
                    weights[kept] += weights[later];
                    weights[later] = 0.0f;
                }
            }

            kept++;
        }
    }

    return kept;
}

// stable sort. in place.
// Sorts colors in place by increasing luminance (r + g + b) using a stable
// insertion sort. The weights are permuted together with the colors so that
// weights[i] keeps describing colors[i]. count must not exceed 8.
static void sort_colors(Vector3 * colors, float * weights, int count) {
    assert(count <= 8);

    // Sort keys, one per color.
    float lum[8];
    for (int i = 0; i < count; ++i) {
        lum[i] = colors[i].x + colors[i].y + colors[i].z;
    }

    // Insertion sort; the strict comparison keeps equal keys in their original order.
    for (int i = 0; i < count; ++i) {
        for (int j = i; j > 0 && lum[j] < lum[j - 1]; --j) {
            swap(lum[j], lum[j - 1]);
            swap(colors[j], colors[j - 1]);
            // Keep each weight paired with its color.
            swap(weights[j], weights[j - 1]);
        }
    }
}


/*
float optimize_center(float colors[4][10], uniform int p, uniform int table_level)
{
    float best_center = 0;
    for (uniform int q = 0; q < 4; q++)
    {
        best_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
    }
    best_center /= 8;

    float best_err = 0;
    for (uniform int q = 0; q < 4; q++)
    {
        float dY = get_etc1_dY(table_level, q);
        best_err += sq(clamp(best_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
    }

    for (uniform int branch = 0; branch < 4; branch++)
    {
        float new_center = 0;
        float sum = 0;
        for (uniform int q = 0; q < 4; q++)
        {
            if (branch <= 1 && q <= branch) continue;
            if (branch >= 2 && q >= branch) continue;
            new_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
            sum += colors[q][3];
        }

        new_center /= sum;

        float err = 0;
        for (uniform int q = 0; q < 4; q++)
        {
            float dY = get_etc1_dY(table_level, q);
            err += sq(clamp(new_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
        }

        if (err < best_err)
        {
            best_err = err;
            best_center = new_center;
        }
    }

    return best_center;
}
*/


// Work-in-progress ETC1 compressor. For each flip mode it extracts the two
// partitions and de-duplicates the colors of the first one, but no base colors
// or tables are derived from them yet.
// NOTE(review): nothing in this function writes result->data before it is
// handed to update_selectors (the pack_colors call is commented out), so the
// returned error depends on whatever the caller left in *result.
static void compress_etc1_test(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {

    // Scratch storage for the 8 pixels of the partition being processed.
    Vector3 colors[8];
    float weights[8];
    //int xrefs[8];
    ETC_SubBlock sub_block[2];

    bool best_flip = false;
    for (int flip = 0; flip <= 1; flip++) {

        partition_input_block(input_colors, input_weights, !!flip, /*partition=*/0, colors, weights);

        // Number of distinct, non-zero-weight colors; currently unused.
        int count = reduce_colors(colors, weights, 8);

        //sort_colors(colors, weights);

        // @@ sort colors along luminance axis.

        //sub_block[0].color

        // Overwrites the scratch arrays with the second partition.
        partition_input_block(input_colors, input_weights, !!flip, /*partition=*/1, colors, weights);

    }

    //pack_colors(sub_block[0].color, sub_block[1].color, &result->data);

    result->error = update_selectors(input_colors, input_weights, result->data, options);

}

/*void pack_colors(const Vector3 & color0, const Vector3 & color1, const ETC_Options & options, ETC_Data * data) {

    uint16 abs_c0 = U16(pack_color_444(color0));
    uint16 abs_c1 = U16(pack_color_444(color1));
    Vector3 abs_vc0 = unpack_color_444(abs_c0);
    Vector3 abs_vc1 = unpack_color_444(abs_c1);
    float abs_error = evaluate_mse(color0, abs_vc0, options.color_weights) + evaluate_mse(color1, abs_vc1, options.color_weights);

    uint16 diff_c0 = U16(pack_color_555(color0));
    Vector3 diff_vc0 = unpack_color_555(diff_c0);
    uint16 diff_d1 = U16(pack_delta_333(color1 - diff_vc0));
    Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
    float diff_error = evaluate_mse(color0, diff_vc0, options.color_weights) + evaluate_mse(color1, diff_vc1, options.color_weights);

    if (diff_error < abs_error) {
        data->etc.color0 = diff_c0;
        data->etc.color1 = diff_c1;
        return diff_error;
    }
    else {
        if (abs_error < best_error) {
            best_error = abs_error;
            best_diff = false;
            best_flip = flip;
            best_c0 = abs_c0;
            best_c1 = abs_c1;
            best_vc0 = abs_vc0;
            best_vc1 = abs_vc1;
        }
    }
}*/

static void compress_etc1_range_fit(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {

    float best_error = NV_FLOAT_MAX;
    bool best_diff = false;
    bool best_flip = false;
    uint16 best_c0 = 0;
    uint16 best_c1 = 0;
    Vector3 best_vc0;
    Vector3 best_vc1;

    for (int flip = 0; flip <= 1; flip++) {
        Vector3 color0 = get_partition_color_average(input_colors, input_weights, !!flip, /*partition=*/0);
        Vector3 color1 = get_partition_color_average(input_colors, input_weights, !!flip, /*partition=*/1);

        uint16 abs_c0 = U16(pack_color_444(color0));
        uint16 abs_c1 = U16(pack_color_444(color1));
        Vector3 abs_vc0 = unpack_color_444(abs_c0);
        Vector3 abs_vc1 = unpack_color_444(abs_c1);
        float abs_error = evaluate_mse(color0, abs_vc0, options.color_weights) + evaluate_mse(color1, abs_vc1, options.color_weights);

        uint16 diff_c0 = U16(pack_color_555(color0));
        Vector3 diff_vc0 = unpack_color_555(diff_c0);
        uint16 diff_d1 = U16(pack_delta_333(color1 - diff_vc0));
        Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
        float diff_error = evaluate_mse(color0, diff_vc0, options.color_weights) + evaluate_mse(color1, diff_vc1, options.color_weights);

        if (diff_error < abs_error) {
            if (diff_error < best_error) {
                best_error = diff_error;
                best_diff = true;
                best_flip = !!flip;
                best_c0 = diff_c0;
                best_c1 = diff_d1;
                best_vc0 = diff_vc0;
                best_vc1 = diff_vc1;
            }
        }
        else {
            if (abs_error < best_error) {
                best_error = abs_error;
                best_diff = false;
                best_flip = !!flip;
                best_c0 = abs_c0;
                best_c1 = abs_c1;
                best_vc0 = abs_vc0;
                best_vc1 = abs_vc1;
            }
        }
    }


    result->data.mode = ETC_Data::Mode_ETC1;
    result->data.etc.flip = best_flip;
    result->data.etc.diff = best_diff;
    result->data.etc.table0 = select_table_index(best_vc0, input_colors, input_weights, best_flip, /*partition=*/0);
    result->data.etc.table1 = select_table_index(best_vc1, input_colors, input_weights, best_flip, /*partition=*/1);
    result->data.etc.color0 = best_c0;
    result->data.etc.color1 = best_c1;

    result->error = update_selectors(input_colors, input_weights, result->data, options);

    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}

#if HAVE_RGETC
#include "nvimage/ColorBlock.h"

void compress_etc1_rg(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {

    rg_etc1::etc1_pack_params pack_params;
    //pack_params.m_quality = rg_etc1::cLowQuality;
    pack_params.m_quality = rg_etc1::cMediumQuality;  // @@ Select quality based on compression options. 

    ColorBlock rgba;
    for (uint i = 0; i < 16; i++) {
        rgba.color(i) = toColor32(input_colors[i]);
    }
    rgba.swizzle(2, 1, 0, 3);

    BlockETC block;
    rg_etc1::pack_etc1_block((void *)&block, (const uint *)rgba.colors(), pack_params);

    unpack_etc2_block(&block, &result->data);

    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}
#endif

static void compress_etc2_planar_solid(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {

    Vector3 C(0);
    float W = 0;

    for (int i = 0; i < 16; i++) {
        C += input_colors[i].xyz() * input_weights[i];
        W += input_weights[i];
    }

    C /= W;

    // Convert colors to 676
    result->data.mode = ETC_Data::Mode_Planar;
    result->data.planar.ro = pack_float_6(C.x);
    result->data.planar.go = pack_float_7(C.y);
    result->data.planar.bo = pack_float_6(C.z);

    result->data.planar.rh = result->data.planar.ro;
    result->data.planar.gh = result->data.planar.go;
    result->data.planar.bh = result->data.planar.bo;

    result->data.planar.rv = result->data.planar.ro;
    result->data.planar.gv = result->data.planar.go;
    result->data.planar.bv = result->data.planar.bo;

    // Evaluate error.
    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}

// Least squares optimization of planar endpoints.
static void compress_etc2_planar_lsqr(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {

    // Isn't this a simple least squares problem?
    // - Yes, but that doesn't take clamping and quantization into account.
    // - Solve the least squares problem, then refine endpoints?

    // This matrix is always the same! But not when using arbitrary weights!
    // This would be faster computing the matrix first, then multiplying by the weight covariance matrix.
    Matrix3 m(0);

    // For every pixel, decoder does:
    // int r = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) >> 2;

    // R(x,y) = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) / 4;

    // R(x,y) = ro * (1 - x/4 - y/4) + rh * x/4 + rv * y/4 + 1/2;

    // a = x/4
    // b = y/4
    // c = 1 - a - b
    // R(x,y) = ro * c + rh * a + rv * b + 1/2;

    float A[3 * 16];

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            float w = input_weights[4*y+x];
            //if ((x == 1 || x == 2) && (y == 1 && y == 2)) w *= 0.5;

            float a = float(x) / 4 * w;
            float b = float(y) / 4 * w;
            float c = (1 - a - b) * w;

            int i = y*4 + x;
            A[3 * i + 0] = a;
            A[3 * i + 1] = b;
            A[3 * i + 2] = c;

            /*for (int yy = 0; yy < 4; yy++) {
                for (int xx = 0; xx < 4; xx++) {
                    float ww = input_weights[4*yy+xx];
                    //if ((xx == 1 || xx == 2) && (yy == 1 && yy == 2)) ww *= 0.5;

                    float aa = float(xx) / 4 * ww;
                    float bb = float(yy) / 4 * ww;
                    float cc = (1 - aa - bb) * ww;

                    m(0,0) += a * aa;
                    m(1,0) += b * aa;
                    m(2,0) += c * aa;

                    m(0,1) += a * bb;
                    m(1,1) += b * bb;
                    m(2,1) += c * bb;

                    m(0,2) += a * cc;
                    m(1,2) += b * cc;
                    m(2,2) += c * cc;
                }
            }*/
        }
    }

    // At*A
    for (int y = 0; y < 3; y++) {
        for (int x = 0; x < 3; x++) {
            float d = 0;
            for (int i = 0; i < 16; i++) {
                d += A[3*i+x] * A[3*i+y];
            }
            m(x, y) = d;
        }
    }

    // Compute right side:
    Vector3 Ca(0), Cb(0), Cc(0);

    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            float a = float(x) / 4;
            float b = float(y) / 4;
            float c = 1 - a - b;

            Vector3 C = input_colors[4*y+x].xyz() - Vector3(0.5f / 255);

            Ca += C * a;
            Cb += C * b;
            Cc += C * c;
        }
    }

    // Now we have 3 equations (one for each color component).

    Vector3 R(Ca.x, Cb.x, Cc.x);
    Vector3 G(Ca.y, Cb.y, Cc.y);
    Vector3 B(Ca.z, Cb.z, Cc.z);

    Vector3 r, g, b;

    if (!solveLU(m, R, &r)) {
        result->error = NV_FLOAT_MAX;
        return;
    }
    if (!solveLU(m, G, &g)) {
        result->error = NV_FLOAT_MAX;
        return;
    }
    if (!solveLU(m, B, &b)) {
        result->error = NV_FLOAT_MAX;
        return;
    }

    Vector3 Ch(r.x, g.x, b.x);
    Vector3 Cv(r.y, g.y, b.y);
    Vector3 Co(r.z, g.z, b.z);

    // Convert colors to 676
    result->data.mode = ETC_Data::Mode_Planar;
    result->data.planar.ro = pack_float_6(Co.x);
    result->data.planar.go = pack_float_7(Co.y);
    result->data.planar.bo = pack_float_6(Co.z);

    result->data.planar.rh = pack_float_6(Ch.x);
    result->data.planar.gh = pack_float_7(Ch.y);
    result->data.planar.bh = pack_float_6(Ch.z);

    result->data.planar.rv = pack_float_6(Cv.x);
    result->data.planar.gv = pack_float_7(Cv.y);
    result->data.planar.bv = pack_float_6(Cv.z);

    // Evaluate error.
    result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);

    bool refine_endpoints = true;
    if (refine_endpoints) {
        ETC_Solution best = *result;

        // @@ The per-component errors are not correllated, test 8 combinations 3 times.
        for (int i = 0; i < 8; i++) {
            result->data.planar.ro = pack_float_6(Co.x, (i & 1) != 0);
            result->data.planar.rh = pack_float_6(Ch.x, (i & 2) != 0);
            result->data.planar.rv = pack_float_6(Cv.x, (i & 4) != 0);

            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
            if (result->error < best.error) {
                best = *result;
            }
        }

        *result = best;

        for (int i = 0; i < 8; i++) {
            result->data.planar.go = pack_float_7(Co.y, (i & 1) != 0);
            result->data.planar.gh = pack_float_7(Ch.y, (i & 2) != 0);
            result->data.planar.gv = pack_float_7(Cv.y, (i & 4) != 0);

            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
            if (result->error < best.error) {
                best = *result;
            }
        }

        *result = best;

        for (int i = 0; i < 8; i++) {
            result->data.planar.bo = pack_float_6(Co.z, (i & 1) != 0);
            result->data.planar.bh = pack_float_6(Ch.z, (i & 2) != 0);
            result->data.planar.bv = pack_float_6(Cv.z, (i & 4) != 0);

            result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
            if (result->error < best.error) {
                best = *result;
            }
        }

        *result = best;
    }
}


// Saturates every input color in place.
static void process_input_colors(Vector4 input_colors[16]) {
    // @@ Sanitize input_weights?
    //  - Avoid blocks with all zero weight.
    //  - Normalize weights to avoid too small values?
    //  - Remove NaNs, infinites, etc.
    for (Vector4 * color = input_colors; color != input_colors + 16; ++color) {
        *color = saturate(*color);
    }
}

// Saturates the selected channel of every input color in place.
static void process_input_alphas(Vector4 input_colors[16], int input_channel) {
    for (int pixel = 0; pixel < 16; pixel++) {
        float & channel = input_colors[pixel].component[input_channel];
        channel = saturate(channel);
    }
}

// Sanitizes the weights in place. If no weight exceeds the minimum threshold
// all weights become 1. Otherwise negative weights are clamped to zero, weights
// below the threshold are flushed to zero, and everything is divided by the
// largest input weight.
static void process_input_weights(float input_weights[16]) {
    const float min_weight = 0.0001f;

    float largest = 0.0f;
    for (int i = 0; i < 16; i++) {
        largest = nv::max(largest, input_weights[i]);
    }

    if (largest <= min_weight) {
        // Degenerate case: treat every pixel as equally important.
        for (int i = 0; i < 16; i++) {
            input_weights[i] = 1.0f;
        }
        return;
    }

    for (int i = 0; i < 16; i++) {
        // Clamp to positive.
        float w = nv::max(input_weights[i], 0.0f);

        // Flush to zero.
        if (w < min_weight) w = 0.0f;

        // Normalize.
        input_weights[i] = w / largest;
    }
}


// One-bit alpha compressor (unfinished). Fully transparent blocks return an
// error of 0 without writing any output; every other block hits the
// "not implemented" check and returns NV_FLOAT_MAX.
static float compress_etc_a1(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
    assert(options.onebit_alpha == true);

    // Classify block.
    bool has_visible_pixel = false;
    bool has_non_opaque_pixel = false;

    for (int pixel = 0; pixel < 16; pixel++) {
        const float alpha = input_colors[pixel].w;
        if (alpha != 0) has_visible_pixel = true;
        if (alpha != 1) has_non_opaque_pixel = true;
    }

    if (!has_visible_pixel) {
        // @@ Encode trivial transparent block.
        return 0;
    }

    if (!has_non_opaque_pixel) {
        // @@ Encode block with opaque bit set. @@ Isn't this like the standard encoder?
    }

    // @@ Encode mixed block.
    nvCheck(false); // Not implemented!

    //uint8 color_rgb[16*3];
    //uint8 alpha[16];
    //uint etc_word1, etc_word2;
    //compressBlockDifferentialWithAlpha(bool isTransparent, uint8* img, uint8* alphaimg, uint8* imgdec, 4, 4, 0, 0, &etc_word1, &etc_word2);

    return NV_FLOAT_MAX;
}

//uint etc_blocks = 0;
//uint planar_blocks = 0;
//#include "nvthread/Atomic.h"

// Compresses one block without one-bit alpha. Starts from the range-fit ETC1
// solution, replaces it with the rg_etc1 and planar candidates when they are
// enabled and have a lower error, packs the winner into `output` and returns
// its error.
static float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
    assert(options.onebit_alpha == false);

    // Baseline candidate.
    ETC_Solution best;
    compress_etc1_range_fit(input_colors, input_weights, options, &best);

#if HAVE_RGETC
    if (options.use_rg_etc) {
        ETC_Solution rg_candidate;
        compress_etc1_rg(input_colors, input_weights, options, &rg_candidate);
        if (rg_candidate.error < best.error) {
            best = rg_candidate;
        }
    }
#else
    // @@ Print warning when options.use_rg_etc is requested?
#endif

    if (options.enable_etc2 && options.use_planar) {
        ETC_Solution planar_candidate;
        compress_etc2_planar_lsqr(input_colors, input_weights, options, &planar_candidate);

        if (planar_candidate.error < best.error) {
            best = planar_candidate;
        }
    }

    // @@ T mode (options.use_t_mode) and H mode (options.use_h_mode) are not implemented yet.

    pack_etc2_block(best.data, (BlockETC *)output);

    return best.error;
}


// Range search EAC compressor, slightly modified from ETCLib.
//
// Encodes one channel of a 4x4 block as an EAC block by brute force: for every
// modifier table it derives a base value and a multiplier from the channel's
// value range, tries all candidates within +/- options.search_radius of them,
// and keeps the combination with the lowest weighted squared error.
//
//   input_colors   16 texels; only component[input_channel] is read.
//   input_weights  per-texel weight applied to the squared error.
//   input_channel  index of the component to encode.
//   options        search_radius (size of the candidate window) and
//                  use_11bit_mode (reconstruct with get_alpha11 instead of get_alpha8).
//   output         receives the packed block; it is written through a BlockEAC pointer.
//
// Returns the weighted squared error of the encoding that was packed.
//
// NOTE(review): with a negative search_radius no candidate is evaluated, so
// best.data reaches pack_eac_block without being assigned by this function and
// NV_FLOAT_MAX is returned. Confirm callers never pass a negative radius.
float compress_eac_range_search(Vector4 input_colors[16], float input_weights[16], int input_channel, const EAC_Options & options, void * output) {

    static const uint MODIFIER_TABLE_ENTRYS = 16;
    static const uint MIN_VALUE_SELECTOR = 3;
    static const uint MAX_VALUE_SELECTOR = 7;
    static const uint ALPHA_SELECTOR_BITS = 3;
    static const uint ALPHA_SELECTORS = 1 << ALPHA_SELECTOR_BITS;

    // Find the value range of the channel. The seeds 1 and 0 mean the range is
    // only exact when the inputs lie in [0,1].
    float min_a = 1.0f;
    float max_a = 0.0f;
    for (uint i = 0; i < 16; i++) {
        const float a = input_colors[i].component[input_channel];
        min_a = nv::min(min_a, a);
        max_a = nv::max(max_a, a);
    }
    const float range_a = max_a - min_a;

    EAC_Solution best;
    best.error = NV_FLOAT_MAX;

    // Try each modifier table entry.
    for (uint t = 0; t < MODIFIER_TABLE_ENTRYS; t++) {

        // Where the base value sits inside the span covered by this table,
        // expressed as a fraction of that span.
        const float fTableEntryCenter = (float)-eac_intensity_modifiers[t][MIN_VALUE_SELECTOR];
        const float fTableEntryRange = (float)eac_intensity_modifiers[t][MAX_VALUE_SELECTOR] - eac_intensity_modifiers[t][MIN_VALUE_SELECTOR];
        const float fCenterRatio = fTableEntryCenter / fTableEntryRange;

        // Base candidates: a window around the estimated center, kept in 0..255.
        const int center = ftoi_round(255.0f * (min_a + fCenterRatio * range_a));
        const int min_base = max(0, center - options.search_radius);
        const int max_base = min(center + options.search_radius, 255);

        // Multiplier candidates: a window around the estimate, kept in 1..15.
        // None of this depends on the base value, so it is computed once per
        // table instead of once per base candidate.
        const int range_multiplier = ftoi_round(255 * range_a / fTableEntryRange);
        const int min_multiplier = clamp(range_multiplier - options.search_radius, 1, 15);
        const int max_multiplier = clamp(range_multiplier + options.search_radius, 1, 15);

        for (int base = min_base; base <= max_base; base++) {
            for (int multiplier = min_multiplier; multiplier <= max_multiplier; multiplier++) {

                // Find the best selector for each pixel.
                float block_error = 0;
                uint best_selector[16];
                for (uint i = 0; i < 16; i++) {

                    float best_error_a = NV_FLOAT_MAX;

                    for (uint s = 0; s < ALPHA_SELECTORS; s++) {
                        float alpha;
                        if (options.use_11bit_mode) {
                            alpha = get_alpha11(base, t, multiplier, s);
                        }
                        else {
                            alpha = get_alpha8(base, t, multiplier, s);
                        }

                        float error_a = alpha - input_colors[i].component[input_channel];
                        error_a = error_a * error_a;

                        if (error_a < best_error_a) {
                            best_error_a = error_a;
                            best_selector[i] = s;
                        }
                    }

                    block_error += best_error_a * input_weights[i];
                    if (block_error > best.error) {
                        // Already worse than the best candidate so far. The
                        // remaining entries of best_selector are left unset,
                        // which is fine because this candidate is rejected below.
                        break;
                    }
                }

                if (block_error < best.error) {
                    best.error = block_error;

                    best.data.alpha = base;
                    best.data.multiplier = multiplier;
                    best.data.table_index = t;
                    for (uint i = 0; i < 16; i++) {
                        // Flip selectors: entry i is taken from the transposed
                        // position of the 4x4 grid.
                        best.data.selector[i] = best_selector[4*(i%4) + i/4];
                    }
                }
            }
        }
    }

    pack_eac_block(best.data, (BlockEAC *)output);

    return best.error;
}


// Public API:

// Decodes one ETC color block into 16 texels.
//
//   input_block    the compressed block; it is read through a BlockETC pointer.
//   output_colors  receives the 16 decoded texels.
//
// Only the first preprocessor branch is compiled: the block is unpacked with
// unpack_etc2_block and decoded with decode_etc2. The other two branches are
// reference decoders (rg_etc1 and etcpack) kept for debugging; the rg_etc1 one
// is disabled by its "&& 0", and the etcpack one is unreachable while the first
// branch is "#if 1".
void nv::decompress_etc(const void * input_block, Vector4 output_colors[16]) {
#if 1 // Our code
    ETC_Data data;
    unpack_etc2_block((const BlockETC *)input_block, &data);

    decode_etc2(data, output_colors);

#elif HAVE_RGETC && 0

    Color32 colors[16];
    rg_etc1::unpack_etc1_block(input_block, &colors->u);

    // Note the swizzle: x is taken from .b and z from .r.
    for (int i = 0; i < 16; i++) {
        output_colors[i].x = colors[i].b * (1.0f / 255.0f);
        output_colors[i].y = colors[i].g * (1.0f / 255.0f);
        output_colors[i].z = colors[i].r * (1.0f / 255.0f);
        output_colors[i].w = colors[i].a * (1.0f / 255.0f);
    }

#elif HAVE_ETCPACK // Use etcpack for reference.
    const BlockETC * block = (const BlockETC *)input_block;

    // etcpack takes the block as two 32-bit words, each passed through POSH_SwapU32.
    uint8 colors[3*16];
    uint part1 = POSH_SwapU32(block->data32[0]);
    uint part2 = POSH_SwapU32(block->data32[1]);
    decompressBlockETC2(part1, part2, colors, 4, 4, 0, 0);

    // The etcpack output holds three bytes per texel; w is forced to 1.
    for (int i = 0; i < 16; i++) {
        output_colors[i].x = colors[3*i+0] * (1.0f / 255.0f);
        output_colors[i].y = colors[3*i+1] * (1.0f / 255.0f);
        output_colors[i].z = colors[3*i+2] * (1.0f / 255.0f);
        output_colors[i].w = 1.0f;
    }
#endif
}

// Decodes one EAC block into a single channel of 16 texels.
//
//   input_block     the compressed block; it is read through a BlockEAC pointer.
//   output_colors   the 16 texels that receive the decoded values.
//   output_channel  index of the destination component; checked to be in 0..3.
//
// The compiled path always decodes with decode_eac_11. The etcpack branch is a
// reference implementation that is unreachable while the first branch is "#if 1".
void nv::decompress_eac(const void * input_block, Vector4 output_colors[16], int output_channel) {
    nvCheck(output_channel >= 0 && output_channel < 4);
    
#if 1
    EAC_Data data;
    unpack_eac_block((const BlockEAC *)input_block, &data);
    decode_eac_11(data, output_colors, output_channel);
    
#elif HAVE_ETCPACK
    // Use etcpack for reference.
    formatSigned = 0;

    uint16 alphas[16];
    decompressBlockAlpha16bit((uint8*)input_block, (uint8*)alphas, 4, 4, 0, 0);

    // Each 16-bit result is passed through POSH_SwapU16 and scaled by 1/65535.
    for (int i = 0; i < 16; i++) {
        uint16 alpha = POSH_SwapU16(alphas[i]);
        output_colors[i].component[output_channel] = alpha * (1.0f / 65535.0f);
    }
#endif
}

void nv::decompress_etc_eac(const void * input, Vector4 output_colors[16]) {
#if 1
    BlockETC_EAC * input_block = (BlockETC_EAC *)input;

    ETC_Data etc;
    unpack_etc2_block(&input_block->etc, &etc);
    decode_etc2(etc, output_colors);

    EAC_Data eac;
    unpack_eac_block(&input_block->eac, &eac);
    decode_eac_8(eac, output_colors, 3);

#elif HAVE_ETCPACK
    // Use etcpack for reference.
    uint8 colors[4*16];
    decompressBlockAlpha((uint8*)input_block, colors, 4, 4, 0, 0);

    for (int i = 0; i < 16; i++) {
        output_colors[i].x = colors[4*i+0] * (1.0f / 255.0f);
        output_colors[i].y = colors[4*i+1] * (1.0f / 255.0f);
        output_colors[i].z = colors[4*i+2] * (1.0f / 255.0f);
        output_colors[i].w = colors[4*i+3] * (1.0f / 255.0f);
    }
#endif
}

// Compresses one 4x4 block to ETC1.
//
//   input_colors   16 texels; passed through process_input_colors first, which
//                  receives the array by pointer and so may modify it.
//   input_weights  16 per-texel weights; passed through process_input_weights
//                  first, which likewise may modify the array.
//   color_weights  per-channel weighting, forwarded in the options.
//   output         receives the packed block.
//
// Returns the error reported by compress_etc for the packed block.
float nv::compress_etc1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {

    process_input_colors(input_colors);
    // Pre-process the weights exactly as compress_etc2 does: both entry points
    // feed the same compress_etc path, so they should hand it weights in the
    // same form.
    process_input_weights(input_weights);

    // @@ Use same options for all blocks?
    ETC_Options options;
    options.use_rg_etc = true;
    options.enable_etc2 = false;    // ETC1 only: none of the ETC2 modes may be used.
    options.use_t_mode = false;
    options.use_h_mode = false;
    options.use_planar = false;
    options.color_weights = color_weights;

    return compress_etc(input_colors, input_weights, options, output);
}

// Compresses one 4x4 block to ETC2 (ETC1 modes plus planar).
//
//   input_colors   16 texels; passed through process_input_colors first, which
//                  receives the array by pointer and so may modify it.
//   input_weights  16 per-texel weights; passed through process_input_weights
//                  first, which likewise may modify the array.
//   color_weights  per-channel weighting, forwarded in the options.
//   output         receives the packed block.
//
// Returns the error reported by compress_etc for the packed block.
float nv::compress_etc2(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {

    process_input_colors(input_colors);
    process_input_weights(input_weights);

    ETC_Options etc2_options;
    etc2_options.color_weights = color_weights;

    // Encoders to try.
    etc2_options.use_rg_etc = true;
    etc2_options.enable_etc2 = true;
    etc2_options.use_planar = true;

    // @@ Not implemented.
    etc2_options.use_t_mode = false;
    etc2_options.use_h_mode = false;

    const float block_error = compress_etc(input_colors, input_weights, etc2_options, output);
    return block_error;
}

// Compresses one 4x4 block to ETC2 with one-bit alpha.
//
//   input_colors   16 texels; passed through process_input_colors first, which
//                  receives the array by pointer and so may modify it.
//   input_weights  16 per-texel weights; passed through process_input_weights
//                  first, which likewise may modify the array.
//   color_weights  per-channel weighting, forwarded in the options.
//   output         forwarded to compress_etc_a1.
//
// Returns whatever compress_etc_a1 returns.
//
// NOTE(review): the one-bit-alpha encoder appears to be unfinished (there is a
// "Not implemented!" check just above compress_etc in this file) -- confirm
// before relying on this entry point.
float nv::compress_etc2_a1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {

    process_input_colors(input_colors);
    process_input_weights(input_weights);

    ETC_Options a1_options;
    a1_options.color_weights = color_weights;
    a1_options.onebit_alpha = true;

    // Encoders to try.
    a1_options.use_rg_etc = true;
    a1_options.enable_etc2 = true;
    a1_options.use_planar = true;

    // @@ Not implemented.
    a1_options.use_t_mode = false;
    a1_options.use_h_mode = false;

    const float block_error = compress_etc_a1(input_colors, input_weights, a1_options, output);
    return block_error;
}


// Compresses one channel of a 4x4 block to EAC.
//
//   input_colors    16 texels; passed through process_input_alphas for the
//                   selected channel first, which receives the array by pointer
//                   and so may modify it.
//   input_weights   16 per-texel weights; passed through process_input_weights
//                   first, which likewise may modify the array.
//   input_channel   index of the component to encode; checked to be in 0..3.
//   search_radius   forwarded to the range search as options.search_radius.
//   use_11bit_mode  forwarded to the range search as options.use_11bit_mode.
//   output          receives the packed block.
//
// Returns the error reported by compress_eac_range_search.
float nv::compress_eac(Vector4 input_colors[16], float input_weights[16], int input_channel, int search_radius, bool use_11bit_mode, void * output) {
    nvCheck(input_channel >= 0 && input_channel < 4);

    process_input_alphas(input_colors, input_channel);
    process_input_weights(input_weights);

    EAC_Options search_options;
    search_options.use_11bit_mode = use_11bit_mode;
    search_options.search_radius = search_radius;

    const float block_error = compress_eac_range_search(input_colors, input_weights, input_channel, search_options, output);
    return block_error;
}

// Compresses one 4x4 block to the combined ETC2 + EAC layout: the color part is
// encoded with compress_etc2 into the block's etc member, then component 3 is
// encoded with compress_eac (search radius 1, 8-bit mode) into its eac member.
//
// Returns the sum of the two reported errors.
float nv::compress_etc2_eac(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
    BlockETC_EAC * const block = static_cast<BlockETC_EAC *>(output);

    // Color is encoded first; both calls may pre-process the shared input arrays.
    const float color_error = compress_etc2(input_colors, input_weights, color_weights, &block->etc);
    const float alpha_error = compress_eac(input_colors, input_weights, /*input_channel=*/3, /*search_radius=*/1, /*use_11bit_mode=*/false, &block->eac);

    return color_error + alpha_error;
}