You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
nvidia-texture-tools/src/nvtt/CompressorETC.cpp

2308 lines
70 KiB
C++

#include "CompressorETC.h"
#include "nvmath/Vector.inl"
#include "nvmath/Matrix.inl"
#include "nvmath/Color.inl"
#include "nvcore/Utils.h" // clamp
#define HAVE_RGETC 1
#define HAVE_ETCPACK 0 // Only enable in OSX for debugging.
#if HAVE_RGETC
#include "rg_etc1.h"
#endif
#if HAVE_ETCPACK
// From etcpack.cxx
extern void decompressBlockETC2(unsigned int block_part1, unsigned int block_part2, uint8 *img, int width, int height, int startx, int starty);
extern void decompressBlockAlpha(uint8* data, uint8* img, int width, int height, int ix, int iy);
extern void decompressBlockAlpha16bit(uint8* data, uint8* img, int width, int height, int ix, int iy);
extern int formatSigned;
#endif
#define assert nvCheck
using namespace nv;
// TODO:
// - Accurate rounding of signed 3-bit components.
// - Range based table selection.
// - Slower try all options table selection?
// - Trivial selector assignment.
// * Base point optimization.
// * Brute force base point optimization.
// - Enumerate and evaluate all clusters.
// - Brute force planar mode endpoint refinement. For each color try two rounding directions (8 tests).
// - T & H modes decompression.
// 64-bit ETC1/ETC2 color block. The union exposes one bit-field view per encoding mode
// (Individual, Differential, T, H, Planar) plus raw integer views of the same 8 bytes.
// pack_etc2_block/unpack_etc2_block write through one view and read through another, so the
// views must overlay each other exactly.
// NOTE(review): the "byte N" comments assume the compiler allocates bit-fields starting at the
// least significant bit of each byte — confirm for every target ABI. The NV_COMPILER_CHECK lines
// only verify the total size of each view, not the bit order.
union BlockETC {
// Definitions from EtcLib/EtcBlock4x4EncodingBits.h
// ETC1 "individual" layout: two independent base colors with 4 bits per channel.
struct Individual {
uint red2 : 4; // byte 0
uint red1 : 4;
uint green2 : 4; // byte 1
uint green1 : 4;
uint blue2 : 4; // byte 2
uint blue1 : 4;
uint flip : 1; // byte 3
uint diff : 1;
uint cw2 : 3;
uint cw1 : 3;
uint selectors; // bytes 4-7
};
NV_COMPILER_CHECK(sizeof(BlockETC::Individual) == 64/8);
// ETC1 "differential" layout: a 5-bit-per-channel base color plus a 3-bit-per-channel delta
// (the unpacking code treats delta values >= 4 as negative, i.e. value - 8).
struct Differential {
uint dred2 : 3; // byte 0
uint red1 : 5;
uint dgreen2 : 3; // byte 1
uint green1 : 5;
uint dblue2 : 3; // byte 2
uint blue1 : 5;
uint flip : 1; // byte 3
uint diff : 1;
uint cw2 : 3;
uint cw1 : 3;
uint selectors; // bytes 4-7
};
NV_COMPILER_CHECK(sizeof(Differential) == 64/8);
// ETC2 T-mode layout: two 444 colors and a 3-bit distance index split into da/db.
// detect1/detect2 overlap Differential::red1/dred2 and are set by the packer so that
// red1 + dred2 leaves [0, 31].
struct T {
uint red1b : 2; // byte 0
uint detect2 : 1;
uint red1a : 2;
uint detect1 : 3;
uint blue1 : 4; // byte 1
uint green1 : 4;
uint green2 : 4; // byte 2
uint red2 : 4;
uint db : 1; // byte 3
uint diff : 1;
uint da : 2;
uint blue2 : 4;
uint selectors; // bytes 4-7
};
NV_COMPILER_CHECK(sizeof(T) == 64/8);
// ETC2 H-mode layout: two 444 colors whose channels are split across bytes, and the two upper
// bits of the distance index (da, db). unpack_etc2_block derives the lowest bit of the index
// from the ordering of the two colors.
struct H {
uint green1a : 3; // byte 0
uint red1 : 4;
uint detect1 : 1;
uint blue1b : 2; // byte 1
uint detect3 : 1;
uint blue1a : 1;
uint green1b : 1;
uint detect2 : 3;
uint green2a : 3; // byte 2
uint red2 : 4;
uint blue1c : 1;
uint db : 1; // byte 3
uint diff : 1;
uint da : 1;
uint blue2 : 4;
uint green2b : 1;
uint selectors; // bytes 4-7
};
NV_COMPILER_CHECK(sizeof(H) == 64/8);
// ETC2 planar layout: origin, horizontal and vertical colors in 676 precision, each channel
// split into the numbered pieces below. There is no selector word in this mode.
struct Planar {
uint originGreen1 : 1; // byte 0
uint originRed : 6;
uint detect1 : 1;
uint originBlue1 : 1; // byte 1
uint originGreen2 : 6;
uint detect2 : 1;
uint originBlue3 : 2; // byte 2
uint detect4 : 1;
uint originBlue2 : 2;
uint detect3 : 3;
uint horizRed2 : 1; // byte 3
uint diff : 1;
uint horizRed1 : 5;
uint originBlue4 : 1;
uint horizBlue1: 1; // byte 4
uint horizGreen : 7;
uint vertRed1 : 3; // byte 5
uint horizBlue2 : 5;
uint vertGreen1 : 5; // byte 6
uint vertRed2 : 3;
uint vertBlue : 6; // byte 7
uint vertGreen2 : 2;
};
NV_COMPILER_CHECK(sizeof(Planar) == 64/8);
// Raw views of the same 8 bytes.
uint64 data64;
uint32 data32[2];
uint8 data8[8];
// Per-mode bit-field views.
Individual individual;
Differential differential;
T t;
H h;
Planar planar;
};
NV_COMPILER_CHECK(sizeof(BlockETC) == 64/8);
// Intensity modifiers added to every channel of a sub-block base color when building its
// 4-entry palette. Rows are the 3-bit table index; columns are sorted from most negative to
// most positive.
static const int etc_intensity_modifiers[8][4] = {
{ -8, -2, 2, 8 },
{ -17, -5, 5, 17 },
{ -29, -9, 9, 29 },
{ -42, -13, 13, 42 },
{ -60, -18, 18, 60 },
{ -80, -24, 24, 80 },
{ -106, -33, 33, 106 },
{ -183, -47, 47, 183 }
};
// Span of each row of etc_intensity_modifiers (last column minus first column).
static const int etc_intensity_range[8] = {
16, 34, 58, 84, 120, 160, 212, 366
};
// Distance values used by the T and H mode decoders, indexed by the 3-bit table value.
static const int etc_th_distances[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
// Permutation between the sorted column order of etc_intensity_modifiers and the 2-bit selector
// value stored in an ETC1 block; the two tables are inverses of each other.
static const uint8 etc_selector_scramble[] = { 3, 2, 0, 1 };
static const uint8 etc_selector_unscramble[] = { 2, 3, 1, 0 };
// Rounding thresholds for quantizing a [0,1] float to 4 bits (see pack_color_444).
// midpoints4[i] is the value halfway between the 8-bit expansions of i and i+1, normalized to
// [0,1]. A 4-bit value expands to 8 bits by replicating the nibble, (v << 4) | v, exactly as
// unpack_color_444 does. The last entry is 1.0 so that 15 is never rounded up for inputs <= 1.
static float midpoints4[16];
NV_AT_STARTUP(
    for (int i = 0; i < 15; i++) {
        // The low nibble must be the value itself; "v >> 4" would always be zero for v < 16.
        float f0 = float(((i+0) << 4) | (i+0)) / 255.0f;
        float f1 = float(((i+1) << 4) | (i+1)) / 255.0f;
        midpoints4[i] = (f0 + f1) * 0.5f;
    }
    midpoints4[15] = 1.0f;
);
// Rounding thresholds for quantizing a [0,1] float to 5 bits (see pack_color_555).
// Entry i is halfway between the 8-bit expansions ((v << 3) | (v >> 2)) of i and i+1, divided
// by 255; e.g. entry 0 is 4/255 and entry 1 is 12/255. The final entry is 1.0.
static const float midpoints5[32] = {
0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
};
//static const float midpoints6[64];
//static const float midpoints7[128];
// ETC2 Modes:
// - ETC1:
// - two partitions (flip modes): 2*(4x2, 2x4)
// - two base colors stored as 444+444 or 555+333
// - two 3 bit intensity modifiers
// - T Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
// - H Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
// - Planar mode: 3 colors 676
// Unpacked, mode-tagged description of one ETC1/ETC2 color block. This is the form the decoders
// and the error evaluation work on; pack_etc2_block/unpack_etc2_block convert to and from the
// BlockETC bit layout.
struct ETC_Data {
// Which member of the anonymous union below is meaningful.
enum Mode {
Mode_ETC1,
Mode_T,
Mode_H,
Mode_Planar,
} mode;
// @@ It may make more sense to store bit-expanded or even float colors here.
union {
// ETC1: two sub-blocks, each with a base color and an intensity table.
// When diff is set, color0 is a packed 555 color and color1 a packed 333 delta;
// otherwise both are packed 444 colors.
struct {
uint16 color0; // 444 or 555
uint16 color1; // 444 or 333
uint8 table0; // 3 bits
uint8 table1; // 3 bits
bool flip; // partition mode
bool diff; // color encoding
} etc;
// T and H modes share the same fields: two packed 444 colors and an index into
// etc_th_distances.
struct {
uint16 color0; // 444
uint16 color1; // 444
uint8 table; // 3 bits
} t, h;
// Planar mode: origin (o), horizontal (h) and vertical (v) colors.
struct {
uint8 ro, go, bo; // 676
uint8 rh, gh, bh; // 676
uint8 rv, gv, bv; // 676
} planar;
};
// Stored in column order (see get_selector). Not filled in by unpack_etc2_block for planar blocks.
uint8 selector[16]; // 2 bit indices (32 bits)
};
// A candidate block encoding together with its error. The error starts at NV_FLOAT_MAX.
// NOTE(review): presumably so that any evaluated candidate compares as better — confirm against the search code.
struct ETC_Solution {
float error = NV_FLOAT_MAX;
ETC_Data data;
};
// Options passed to the ETC color compressor (see the compress_etc declaration).
// Only color_weights is consumed by code visible in this part of the file (evaluate_rgb_mse);
// the meaning of the remaining flags is inferred from their names — verify against compress_etc.
struct ETC_Options {
//bool fast_flip_mode_selection = false;
bool use_rg_etc = true;
bool enable_etc2 = true;
bool use_planar = true;
bool use_t_mode = true;
bool use_h_mode = true;
bool onebit_alpha = false;
// Per-channel weights multiplied into the color difference before it is squared.
Vector3 color_weights = Vector3(1);
//int8 eac_search_radius = 1; // [0-3]
//bool eac_11bit_mode = false;
};
// Forward declaration of the block compressor; the definition is not in this part of the file.
// NOTE(review): presumably returns the error of the chosen encoding and writes it to *output — confirm at the definition.
/*static*/ float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, BlockETC * output);
// 64-bit EAC block: base value, table index, multiplier and 48 bits of selectors.
// The selectors are split into six byte-sized fields; pack_eac_block stores the most significant
// byte of the 48-bit selector word in selectors0 and the least significant in selectors5.
struct BlockEAC {
uint base : 8;
uint table : 4;
uint multiplier : 4;
uint selectors0 : 8;
uint selectors1 : 8;
uint selectors2 : 8;
uint selectors3 : 8;
uint selectors4 : 8;
uint selectors5 : 8;
};
NV_COMPILER_CHECK(sizeof(BlockEAC) == 64/8);
// 128-bit block made of an EAC block followed by an ETC color block.
struct BlockETC_EAC {
BlockEAC eac;
BlockETC etc;
};
NV_COMPILER_CHECK(sizeof(BlockETC_EAC) == 128/8);
// EAC:
// 8 bit base code word
// 4 bit multiplier
// 4 bit table index
// 16 * 3 bit indices.
// Unpacked form of a BlockEAC, as consumed by decode_eac_8 / decode_eac_11.
struct EAC_Data {
uint8 alpha; // 8 bits
uint8 multiplier; // 4 bits
uint8 table_index; // 4 bits
// The decoders read entry 4*x + y for the texel at (x, y), i.e. column order.
uint8 selector[16]; // 3 bit indices
};
// A candidate EAC encoding together with its error; the error starts at NV_FLOAT_MAX.
struct EAC_Solution {
float error = NV_FLOAT_MAX;
EAC_Data data;
};
// Options for the EAC compressor. Neither field is consumed by code visible in this part of
// the file — verify their exact effect against the EAC search code.
struct EAC_Options {
int search_radius = 1; // 0 = fast, 1 = medium, 2 = slow
bool use_11bit_mode = false;
};
// EAC modifier tables: row = 4-bit table index, column = 3-bit selector.
// get_alpha8 / get_alpha11 scale the chosen entry by the block multiplier and add it to the base
// value. Columns 0-3 hold the negative modifiers and columns 4-7 the non-negative ones.
static const int eac_intensity_modifiers[16][8] = {
{-3, -6, -9, -15, 2, 5, 8, 14}, // 0
{-3, -7,-10, -13, 2, 6, 9, 12}, // 1
{-2, -5, -8, -13, 1, 4, 7, 12}, // 2
{-2, -4, -6, -13, 1, 3, 5, 12}, // 3
{-3, -6, -8, -12, 2, 5, 7, 11}, // 4
{-3, -7, -9, -11, 2, 6, 8, 10}, // 5
{-4, -7, -8, -11, 3, 6, 7, 10}, // 6
{-3, -5, -8, -11, 2, 4, 7, 10}, // 7
{-2, -6, -8, -10, 1, 5, 7, 9 }, // 8
{-2, -5, -8, -10, 1, 4, 7, 9 }, // 9
{-2, -4, -8, -10, 1, 3, 7, 9 }, // 10
{-2, -5, -7, -10, 1, 4, 6, 9 }, // 11
{-3, -4, -7, -10, 2, 3, 6, 9 }, // 12
{-1, -2, -3, -10, 0, 1, 2, 9 }, // 13
{-4, -6, -8, -9, 3, 5, 7, 8 }, // 14
{-3, -5, -7, -9, 2, 4, 6, 8 } // 15
};
// Packs an unpacked ETC_Data description into the 64-bit BlockETC bit layout.
//
//  data          block description; data.mode selects which union member is read.
//  output_block  receives the packed block (no byte swapping is performed here).
//
// ETC2 modes are signalled by making the differential color of one channel overflow:
// red for T, green for H, blue for planar. The "detect" bits are chosen to force (or prevent)
// that overflow given the payload bits that share the same bytes.
static void pack_etc2_block(const ETC_Data & data, BlockETC * output_block) {
    BlockETC block;
    // Start from a defined bit pattern so that reading one view after writing another never
    // observes indeterminate bits.
    block.data64 = 0;
    bool swap_colors = false;

    if (data.mode == ETC_Data::Mode_ETC1) {
        // These are the same for individual and differential blocks.
        block.individual.diff = data.etc.diff;
        block.individual.flip = data.etc.flip;
        block.individual.cw1 = data.etc.table0;
        block.individual.cw2 = data.etc.table1;
        if (data.etc.diff) {
            // color0 is 555, color1 is a 333 delta; bit-field assignment keeps the low bits.
            block.differential.red1 = data.etc.color0 >> 10;
            block.differential.dred2 = data.etc.color1 >> 6;
            block.differential.green1 = (data.etc.color0 >> 5) & 0x1F;
            block.differential.dgreen2 = (data.etc.color1 >> 3) & 0x7;
            block.differential.blue1 = data.etc.color0 & 0x1F;
            block.differential.dblue2 = data.etc.color1 & 0x7;
        }
        else {
            block.individual.red1 = data.etc.color0 >> 8;
            block.individual.red2 = data.etc.color1 >> 8;
            block.individual.green1 = (data.etc.color0 >> 4) & 0xF;
            block.individual.green2 = (data.etc.color1 >> 4) & 0xF;
            block.individual.blue1 = data.etc.color0 & 0xF;
            block.individual.blue2 = data.etc.color1 & 0xF;
        }
    }
    else if (data.mode == ETC_Data::Mode_T) {
        block.t.red1a = (data.t.color0 >> 8) >> 2;
        block.t.red1b = (data.t.color0 >> 8);
        block.t.green1 = (data.t.color0 >> 4);
        block.t.blue1 = data.t.color0;
        block.t.red2 = (data.t.color1 >> 8);
        block.t.green2 = (data.t.color1 >> 4);
        block.t.blue2 = data.t.color1;
        block.t.da = data.t.table >> 1;
        block.t.db = data.t.table;
        block.t.diff = 1;

        // Create an invalid R differential to trigger T mode. The detect bits must be zero
        // while the payload sum is evaluated, otherwise the test below sees garbage.
        block.t.detect1 = 0;
        block.t.detect2 = 0;
        int dr = block.differential.dred2;
        if (dr >= 4) dr -= 8;
        int r = block.differential.red1 + dr;
        if (r >= 4) {
            // Push red1 to 28..31 so that the sum exceeds 31.
            block.t.detect1 = 7;
            block.t.detect2 = 0;
        }
        else {
            // Make the delta negative (-4..-1) so that the sum drops below 0.
            block.t.detect1 = 0;
            block.t.detect2 = 1;
        }
    }
    else if (data.mode == ETC_Data::Mode_H) {
        // The lowest bit of the distance index is not stored; the decoder derives it from the
        // ordering of the two colors (color0 >= color1 means 1). Swap the colors when needed
        // and compensate by flipping the high selector bit of every texel further below.
        bool table_lsb = data.h.table & 1;
        swap_colors = (data.h.color0 < data.h.color1) ^ !table_lsb;
        uint16 color0 = data.h.color0;
        uint16 color1 = data.h.color1;
        if (swap_colors) {
            swap(color0, color1);
        }

        // Pack the (possibly swapped) colors, not the original ones.
        block.h.red1 = (color0 >> 8);
        block.h.green1a = (color0 >> 4) >> 1;
        block.h.green1b = (color0 >> 4);
        block.h.blue1a = color0 >> 3;
        block.h.blue1b = color0 >> 1;
        block.h.blue1c = color0;
        block.h.red2 = (color1 >> 8);
        block.h.green2a = (color1 >> 4) >> 1;
        block.h.green2b = (color1 >> 4);
        block.h.blue2 = color1;     // Blue is the low nibble of the packed 444 color.
        block.h.da = data.h.table >> 2;
        block.h.db = data.h.table >> 1;
        block.h.diff = 1;

        // Create a valid R differential and an invalid G differential to trigger H mode.
        block.h.detect1 = 0;
        block.h.detect2 = 0;
        block.h.detect3 = 0;
        int dr = block.differential.dred2;
        int dg = block.differential.dgreen2;
        if (dr >= 4) dr -= 8;
        if (dg >= 4) dg -= 8;
        int r = block.differential.red1 + dr;
        int g = block.differential.green1 + dg;
        if (r < 0 || r > 31) {
            block.h.detect1 = 1;
        }
        if (g >= 4) {
            block.h.detect2 = 7;
            block.h.detect3 = 0;
        }
        else {
            block.h.detect2 = 0;
            block.h.detect3 = 1;
        }
    }

    if (data.mode == ETC_Data::Mode_Planar) {
        // From ETCLib:
        block.planar.originRed = data.planar.ro;
        block.planar.originGreen1 = data.planar.go >> 6;
        block.planar.originGreen2 = data.planar.go;
        block.planar.originBlue1 = data.planar.bo >> 5;
        block.planar.originBlue2 = data.planar.bo >> 3;
        block.planar.originBlue3 = data.planar.bo >> 1;
        block.planar.originBlue4 = data.planar.bo;
        block.planar.horizRed1 = data.planar.rh >> 1;
        block.planar.horizRed2 = data.planar.rh;
        block.planar.horizGreen = data.planar.gh;
        block.planar.horizBlue1 = data.planar.bh >> 5;
        block.planar.horizBlue2 = data.planar.bh;
        block.planar.vertRed1 = data.planar.rv >> 3;
        block.planar.vertRed2 = data.planar.rv;
        block.planar.vertGreen1 = data.planar.gv >> 2;
        block.planar.vertGreen2 = data.planar.gv;
        block.planar.vertBlue = data.planar.bv;
        block.planar.diff = 1;

        // Create valid RG differentials and an invalid B differential to trigger planar mode.
        block.planar.detect1 = 0;
        block.planar.detect2 = 0;
        block.planar.detect3 = 0;
        block.planar.detect4 = 0;
        // @@ Clean this up.
        int dr = block.differential.dred2;
        int dg = block.differential.dgreen2;
        int db = block.differential.dblue2;
        if (dr >= 4) dr -= 8;
        if (dg >= 4) dg -= 8;
        if (db >= 4) db -= 8;
        int r = block.differential.red1 + dr;
        int g = block.differential.green1 + dg;
        int b = block.differential.blue1 + db;
        if (r < 0 || r > 31) {
            block.planar.detect1 = 1;
        }
        if (g < 0 || g > 31) {
            block.planar.detect2 = 1;
        }
        if (b >= 4) {
            block.planar.detect3 = 7;
            block.planar.detect4 = 0;
        }
        else {
            block.planar.detect3 = 0;
            block.planar.detect4 = 1;
        }
    }
    else {
        // All other modes carry sixteen 2-bit selectors, split into a plane of high bits
        // (low 16 bits of the word) and a plane of low bits (high 16 bits); "^ 8" swaps the
        // two bytes within each plane.
        block.individual.selectors = 0;
        for (int i = 0; i < 16; i++) {
            uint selector = data.selector[i];
            // Only ETC1 stores selectors in scrambled order; T and H selectors index the
            // palette directly, matching unpack_etc2_block.
            if (data.mode == ETC_Data::Mode_ETC1) {
                selector = etc_selector_scramble[selector];
            }
            block.individual.selectors |= (selector >> 1) << (i ^ 8);
            block.individual.selectors |= (selector & 1) << ((16 + i) ^ 8);
        }
        if (swap_colors) {
            // Flip the high bit of every selector so texels follow their color after the swap.
            block.individual.selectors ^= 0x0000FFFF;
        }
    }

    // @@ output_block is big endian, byte swap:
    *output_block = block;
}
// Decodes the bit layout of a BlockETC into an ETC_Data description.
//
//  input_block  packed block; it is copied and interpreted as-is (no byte swapping is done here).
//  data         receives the mode, colors, table indices and, for non-planar modes, selectors.
//
// The block is first read as ETC1. If the diff bit is set and base + delta leaves [0, 31] for
// red, green or blue, the block is reinterpreted as T, H or planar respectively, and the ETC1
// fields written earlier are overwritten through the matching union member.
static void unpack_etc2_block(const BlockETC * input_block, ETC_Data * data) {
// @@ input_block is big endian, byte swap first:
BlockETC block = *input_block;
// Assume ETC1 for now.
data->mode = ETC_Data::Mode_ETC1;
// These are the same for individual and differential blocks.
data->etc.diff = block.individual.diff != 0;
data->etc.flip = block.individual.flip != 0;
data->etc.table0 = block.individual.cw1;
data->etc.table1 = block.individual.cw2;
// Decode colors.
if (data->etc.diff) {
// color0 is the packed 555 base color; color1 keeps the raw 3-bit deltas (packed 333).
data->etc.color0 = U16((block.differential.red1 << 10) | (block.differential.green1 << 5) | block.differential.blue1);
data->etc.color1 = U16((block.differential.dred2 << 6) | (block.differential.dgreen2 << 3) | block.differential.dblue2);
// @@ Clean this up.
// Sign-extend the 3-bit deltas and form the second base color to test for overflow.
int dr = block.differential.dred2;
int dg = block.differential.dgreen2;
int db = block.differential.dblue2;
if (dr >= 4) dr -= 8;
if (dg >= 4) dg -= 8;
if (db >= 4) db -= 8;
int r = block.differential.red1 + dr;
int g = block.differential.green1 + dg;
int b = block.differential.blue1 + db;
// Detect ETC2 modes (invalid combinations).
// The channels are tested in order: red first, then green, then blue.
if (r < 0 || r > 31) {
data->mode = ETC_Data::Mode_T;
}
else if (g < 0 || g > 31) {
data->mode = ETC_Data::Mode_H;
}
else if (b < 0 || b > 31) {
data->mode = ETC_Data::Mode_Planar;
}
}
else {
data->etc.color0 = U16((block.individual.red1 << 8) | (block.individual.green1 << 4) | block.individual.blue1);
data->etc.color1 = U16((block.individual.red2 << 8) | (block.individual.green2 << 4) | block.individual.blue2);
}
if (data->mode == ETC_Data::Mode_T) {
// Reassemble the two 444 colors; red of the first color is split into two 2-bit pieces.
uint16 r0 = U16((block.t.red1a << 2) | block.t.red1b);
uint16 g0 = U16(block.t.green1);
uint16 b0 = U16(block.t.blue1);
data->t.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;
uint16 r1 = U16(block.t.red2);
uint16 g1 = U16(block.t.green2);
uint16 b1 = U16(block.t.blue2);
data->t.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;
data->t.table = U8((block.t.da << 1) | block.t.db);
}
else if (data->mode == ETC_Data::Mode_H) {
// Reassemble the two 444 colors from their split pieces.
uint16 r0 = U16(block.h.red1);
uint16 g0 = U16((block.h.green1a << 1) | block.h.green1b);
uint16 b0 = U16((block.h.blue1a << 3) | (block.h.blue1b << 1) | block.h.blue1c);
data->h.color0 = U16(r0 << 8) | U16(g0 << 4) | b0;
uint16 r1 = U16(block.h.red2);
uint16 g1 = U16((block.h.green2a << 1) | block.h.green2b);
uint16 b1 = U16(block.h.blue2);
data->h.color1 = U16(r1 << 8) | U16(g1 << 4) | b1;
// Only the two upper bits of the table index are stored; the lowest bit is implied by
// the ordering of the two packed colors.
data->h.table = U8((block.h.da << 2) | (block.h.db << 1));
if (data->h.color0 >= data->h.color1) {
data->h.table++;
}
}
if (data->mode == ETC_Data::Mode_Planar) {
// Reassemble the 676 origin/horizontal/vertical colors. data->selector is left untouched.
data->planar.ro = U8(block.planar.originRed);
data->planar.go = U8((block.planar.originGreen1 << 6) + block.planar.originGreen2);
data->planar.bo = U8((block.planar.originBlue1 << 5) + (block.planar.originBlue2 << 3) + (block.planar.originBlue3 << 1) + block.planar.originBlue4);
data->planar.rh = U8((block.planar.horizRed1 << 1) + block.planar.horizRed2);
data->planar.gh = U8(block.planar.horizGreen);
data->planar.bh = U8((block.planar.horizBlue1 << 5) + block.planar.horizBlue2);
data->planar.rv = U8((block.planar.vertRed1 << 3) + block.planar.vertRed2);
data->planar.gv = U8((block.planar.vertGreen1 << 2) + block.planar.vertGreen2);
data->planar.bv = U8(block.planar.vertBlue);
}
else {
// Note, selectors are arranged in columns, keep that order.
// The selector word is read byte-wise: bytes 0-1 hold the high bit of each selector and
// bytes 2-3 the low bit, with texels 0-7 in the second byte of each pair.
unsigned char * selectors = (uint8 *)&block.individual.selectors;
for (int i = 0; i < 16; i++) {
int byte_msb = (1 - (i / 8));
int byte_lsb = (3 - (i / 8));
int shift = (i & 7);
uint msb = (selectors[byte_msb] >> shift) & 1;
uint lsb = (selectors[byte_lsb] >> shift) & 1;
uint index = (msb << 1) | lsb;
if (data->mode == ETC_Data::Mode_ETC1) {
// Map the stored value back to the sorted modifier column.
data->selector[i] = etc_selector_unscramble[index];
}
else {
// No scrambling in T & H modes.
data->selector[i] = index;
}
}
}
}
static void pack_eac_block(const EAC_Data & data, BlockEAC * output_block) {
output_block->base = data.alpha;
output_block->table = data.table_index;
output_block->multiplier = data.multiplier;
uint64 selector_bits = 0;
for (uint i = 0; i < 16; i++) {
uint shift = 45 - (3 * i);
selector_bits |= uint64(data.selector[i]) << shift;
}
output_block->selectors0 = selector_bits >> 40;
output_block->selectors1 = selector_bits >> 32;
output_block->selectors2 = selector_bits >> 24;
output_block->selectors3 = selector_bits >> 16;
output_block->selectors4 = selector_bits >> 8;
output_block->selectors5 = selector_bits >> 0;
}
static void unpack_eac_block(const BlockEAC * input_block, EAC_Data * data) {
data->alpha = input_block->base;
data->table_index = input_block->table;
data->multiplier = input_block->multiplier;
uint64 selector_bits = 0;
selector_bits |= uint64(input_block->selectors0) << 40;
selector_bits |= uint64(input_block->selectors1) << 32;
selector_bits |= uint64(input_block->selectors2) << 24;
selector_bits |= uint64(input_block->selectors3) << 16;
selector_bits |= uint64(input_block->selectors4) << 8;
selector_bits |= uint64(input_block->selectors5) << 0;
for (uint i = 0; i < 16; i++) {
uint shift = 45 - (3 * i);
data->selector[i] = (selector_bits >> shift) & 0x7;
}
}
// Expands an nin-bit value to nout bits by shifting it up and filling the vacated low bits
// with its own most significant bits.
// Requires nout > nin (checked) and 2*nin >= nout (not checked; the right-shift amount would
// wrap around otherwise).
inline int bitexpand(uint32 bits, uint nin, uint nout) {
    assert(nout > nin);
    //assert(nout - nin > nin);
    const uint up = uint(nout - nin);           // bits appended at the bottom
    const uint down = uint(2U * nin - nout);    // drops all but the top 'up' bits
    const uint32 high_part = bits << up;
    const uint32 low_part = bits >> down;
    return high_part | low_part;
}
// Integer color unpacking for the decompressor.
// Splits a packed 444 color (red in bits 8-11, green in 4-7, blue in 0-3) and expands each
// channel to 8 bits by nibble replication.
static void unpack_color_444(uint32 packed_color, int * r, int * g, int * b) {
    const int red4 = (packed_color >> 8) & 0xF;
    const int green4 = (packed_color >> 4) & 0xF;
    const int blue4 = packed_color & 0xF;
    // (v << 4) | v equals v * 17 for a 4-bit v.
    *r = red4 * 17;
    *g = green4 * 17;
    *b = blue4 * 17;
}
// Same as above, returning the color as floats scaled by 1/255.
static Vector3 unpack_color_444(uint32 packed_color) {
    int rgb[3];
    unpack_color_444(packed_color, &rgb[0], &rgb[1], &rgb[2]);
    const Vector3 color8(float(rgb[0]), float(rgb[1]), float(rgb[2]));
    return color8 * 1.0f / 255.0f;
}
// Splits a packed 555 color (red in bits 10-14, green in 5-9, blue in 0-4) and expands each
// channel to 8 bits by appending its top three bits.
static void unpack_color_555(uint32 packed_color, int * r, int * g, int * b) {
    const int red5 = (packed_color >> 10) & 0x1F;
    const int green5 = (packed_color >> 5) & 0x1F;
    const int blue5 = packed_color & 0x1F;
    *r = (red5 << 3) | (red5 >> 2);
    *g = (green5 << 3) | (green5 >> 2);
    *b = (blue5 << 3) | (blue5 >> 2);
}
// Same as above, returning the color as floats scaled by 1/255.
static Vector3 unpack_color_555(uint32 packed_color) {
    int rgb[3];
    unpack_color_555(packed_color, &rgb[0], &rgb[1], &rgb[2]);
    const Vector3 color8(float(rgb[0]), float(rgb[1]), float(rgb[2]));
    return color8 * 1.0f / 255.0f;
}
// Extracts the three 3-bit fields of a packed 333 delta (red in bits 6-8, green in 3-5, blue in
// 0-2) and returns them as signed values in [-4, 3], without bit expansion.
static void unpack_delta_333(uint32 packed_delta, int * r, int * g, int * b) {
    const int raw_r = (packed_delta >> 6) & 7;
    const int raw_g = (packed_delta >> 3) & 7;
    const int raw_b = packed_delta & 7;
    // Values 4..7 encode -4..-1.
    *r = (raw_r >= 4) ? raw_r - 8 : raw_r;
    *g = (raw_g >= 4) ? raw_g - 8 : raw_g;
    *b = (raw_b >= 4) ? raw_b - 8 : raw_b;
}
// Adds a signed 333 delta to a packed 555 color and expands the result to 8 bits per channel.
// Returns false when any channel of the sum leaves [0, 31]; the channels are then clamped to
// that range before expansion.
static bool unpack_color_555(uint32 packed_color, uint32 packed_delta, int * r, int * g, int * b) {
    int delta_r, delta_g, delta_b;
    unpack_delta_333(packed_delta, &delta_r, &delta_g, &delta_b);

    int red5 = int((packed_color >> 10U) & 0x1F) + delta_r;
    int green5 = int((packed_color >> 5U) & 0x1F) + delta_g;
    int blue5 = int(packed_color & 0x1F) + delta_b;

    // A negative channel sets the sign bit and an overflowing one sets bit 5, so a single
    // unsigned comparison of the OR-ed channels catches both cases.
    const bool in_range = static_cast<uint>(red5 | green5 | blue5) <= 31U;
    if (!in_range) {
        red5 = clamp(red5, 0, 31);
        green5 = clamp(green5, 0, 31);
        blue5 = clamp(blue5, 0, 31);
    }

    *r = (red5 << 3) | (red5 >> 2);
    *g = (green5 << 3) | (green5 >> 2);
    *b = (blue5 << 3) | (blue5 >> 2);
    return in_range;
}
// Same as above, returning the color as floats scaled by 1/255. Asserts that the sum was in range.
static Vector3 unpack_color_555(uint32 packed_color, uint32 packed_delta) {
    int rgb[3];
    bool success = unpack_color_555(packed_color, packed_delta, &rgb[0], &rgb[1], &rgb[2]);
    assert(success);
    const Vector3 color8(float(rgb[0]), float(rgb[1]), float(rgb[2]));
    return color8 * 1.0f / 255.0f;
}
// Splits a packed 676 color (red in bits 13-18, green in 6-12, blue in 0-5) and expands each
// channel to 8 bits with bitexpand.
static void unpack_color_676(uint32 packed_color, int * r, int * g, int * b) {
    const int red6 = (packed_color >> 13) & 0x3F;
    const int green7 = (packed_color >> 6) & 0x7F;
    const int blue6 = packed_color & 0x3F;
    *r = bitexpand(red6, 6, 8);     // red6 << 2 | red6 >> 4
    *g = bitexpand(green7, 7, 8);   // green7 << 1 | green7 >> 6
    *b = bitexpand(blue6, 6, 8);    // blue6 << 2 | blue6 >> 4
}
// Quantizes a float color to packed 444 (red in bits 8-11, green in 4-7, blue in 0-3).
// Each channel is truncated to 4 bits and then bumped by one when it lies above the
// corresponding midpoints4 threshold.
// Components are clamped to [0,1] first: an input above 1 would otherwise pass the threshold
// test at index 15 and produce 16, which spills into the neighbouring channel.
static uint32 pack_color_444(Vector3 color) {
    const float x = clamp(color.x, 0.0f, 1.0f);
    const float y = clamp(color.y, 0.0f, 1.0f);
    const float z = clamp(color.z, 0.0f, 1.0f);
    // Truncate.
    uint r = U32(ftoi_trunc(x * 15.0f));
    uint g = U32(ftoi_trunc(y * 15.0f));
    uint b = U32(ftoi_trunc(z * 15.0f));
    // Round exactly according to 444 bit-expansion.
    r += (x > midpoints4[r]);
    g += (y > midpoints4[g]);
    b += (z > midpoints4[b]);
    return (r << 8) | (g << 4) | b;
}
// Quantizes a float color to packed 555 (red in bits 10-14, green in 5-9, blue in 0-4).
// Each channel is truncated to 5 bits and then bumped by one when it lies above the
// corresponding midpoints5 threshold.
// Components are clamped to [0,1] first: an input above 1 would otherwise pass the threshold
// test at index 31 and produce 32, which spills into the neighbouring channel.
static uint32 pack_color_555(Vector3 color) {
    const float x = clamp(color.x, 0.0f, 1.0f);
    const float y = clamp(color.y, 0.0f, 1.0f);
    const float z = clamp(color.z, 0.0f, 1.0f);
    // Truncate.
    uint r = U32(ftoi_trunc(x * 31.0f));
    uint g = U32(ftoi_trunc(y * 31.0f));
    uint b = U32(ftoi_trunc(z * 31.0f));
    // Round exactly according to 555 bit-expansion.
    r += (x > midpoints5[r]);
    g += (y > midpoints5[g]);
    b += (z > midpoints5[b]);
    return (r << 10) | (g << 5) | b;
}
// Quantizes a float color delta to a packed 333 delta (red in bits 6-8, green in 3-5, blue in
// 0-2). Each component is scaled by 31, clamped to [-4, 3], rounded, and stored in 3 bits with
// negative values encoded as value + 8.
static uint32 pack_delta_333(Vector3 delta) {
    // @@ Accurate rounding of signed 3-bit components.
    const int dr = ftoi_round(clamp(delta.x * 31.0f, -4.0f, 3.0f));
    const int dg = ftoi_round(clamp(delta.y * 31.0f, -4.0f, 3.0f));
    const int db = ftoi_round(clamp(delta.z * 31.0f, -4.0f, 3.0f));
    //r += (delta.x > delta_midpoints3[r]);
    //g += (delta.y > delta_midpoints3[g]);
    //b += (delta.z > delta_midpoints3[b]);
    const int r = (dr < 0) ? dr + 8 : dr;
    const int g = (dg < 0) ? dg + 8 : dg;
    const int b = (db < 0) ? db + 8 : db;
    return static_cast<uint16>(b | (g << 3) | (r << 6));
}
// Quantizes f in [0,1] to the 6-bit code whose 8-bit expansion is nearest.
// The value is truncated and then rounded up when it lies above the midpoint of the two
// neighbouring codes. The midpoint is computed on the 0..255 scale of bitexpand and must be
// divided by 255 before it can be compared with f.
// f is clamped to [0,1] so the result never exceeds 63.
static uint8 pack_float_6(float f) {
    f = clamp(f, 0.0f, 1.0f);
    // Truncate.
    uint u = U32(ftoi_trunc(f * 63.0f));
    // Round exactly according to 6 bit-expansion.
    //u += (f > midpoints6[u]);
    float midpoint = 0.5f * (bitexpand(u, 6, 8) + bitexpand(min(u + 1, 63U), 6, 8)) / 255.0f; // @@ Precompute.
    u += (f > midpoint);
    return U8(u);
}
// Quantizes f in [0,1] to the 7-bit code whose 8-bit expansion is nearest.
// The value is truncated and then rounded up when it lies above the midpoint of the two
// neighbouring codes. The midpoint is computed on the 0..255 scale of bitexpand and must be
// divided by 255 before it can be compared with f.
// f is clamped to [0,1] so the result never exceeds 127.
static uint8 pack_float_7(float f) {
    f = clamp(f, 0.0f, 1.0f);
    // Truncate.
    uint u = U32(ftoi_trunc(f * 127.0f));
    // Round exactly according to 7 bit-expansion.
    //u += (f > midpoints7[u]);
    float midpoint = 0.5f * (bitexpand(u, 7, 8) + bitexpand(min(u + 1, 127U), 7, 8)) / 255.0f; // @@ Precompute.
    u += (f > midpoint);
    return U8(u);
}
// Quantizes f to 6 bits with an explicit rounding direction: round_dir == false truncates
// f * 63, round_dir == true adds one before truncating. The result is clamped to [0, 63].
static uint8 pack_float_6(float f, bool round_dir) {
    const float scaled = f * 63.0f + round_dir;
    const float limited = clamp(scaled, 0.0f, 63.0f);
    return U8(U32(ftoi_trunc(limited)));
}
// Quantizes f to 7 bits with an explicit rounding direction: round_dir == false truncates
// f * 127, round_dir == true adds one before truncating. The result is clamped to [0, 127].
static uint8 pack_float_7(float f, bool round_dir) {
    const float scaled = f * 127.0f + round_dir;
    const float limited = clamp(scaled, 0.0f, 127.0f);
    return U8(U32(ftoi_trunc(limited)));
}
// Returns the weighted average RGB color of one half of a 4x4 block.
//
//  input_colors   16 texels in row-major order (index = 4*y + x).
//  input_weights  per-texel weights.
//  flip           true: the block is split into top/bottom halves (rows 0-1 and rows 2-3);
//                 false: it is split into left/right halves (columns 0-1 and columns 2-3).
//  partition      0 selects the first half, non-zero the second.
//
// If all weights of the half are zero the (zero) weighted sum is returned unscaled.
Vector3 get_partition_color_average(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {
    Vector3 sum_c(0);
    float sum_w = 0;
    if (flip) {
        // Horizontal partition: eight consecutive texels.
        int offset = partition ? 8 : 0;
        for (int i = 0; i < 8; i++) {
            sum_c += input_colors[i+offset].xyz() * input_weights[i+offset];
            sum_w += input_weights[i+offset];
        }
    }
    else {
        // Vertical partition: two adjacent columns of every row. Rows are four texels apart.
        const int first_column = partition ? 2 : 0;
        for (int row = 0; row < 4; row++) {
            for (int x = first_column; x < first_column + 2; x++) {
                const int idx = 4 * row + x;
                sum_c += input_colors[idx].xyz() * input_weights[idx];
                sum_w += input_weights[idx];
            }
        }
    }
    if (sum_w == 0) {
        // Avoid dividing by zero.
        sum_w = 1;
    }
    return sum_c * 1.0f / sum_w;
}
// Approximates the base color of a partition with the unweighted mean of its eight colors.
Vector3 base_color_average(const Vector3 colors[8]) {
    Vector3 total(0);
    for (const Vector3 * c = colors; c != colors + 8; ++c) {
        total += *c;
    }
    return total * 1.0f / 8.0f;
}
// Approximates the base color of a partition with the weighted mean of its eight colors.
// If all weights are zero the (zero) weighted sum is returned unscaled instead of dividing by
// zero, matching get_partition_color_average.
Vector3 base_color_average(const Vector3 colors[8], const float weights[8]) {
    Vector3 sum_c(0);
    float sum_w = 0;
    for (uint i = 0; i < 8; i++) {
        sum_c += colors[i] * weights[i];
        sum_w += weights[i];
    }
    if (sum_w == 0) {
        sum_w = 1;
    }
    return sum_c * 1.0f / sum_w;
}
// Disabled experimental code: least-squares base color fitting and exhaustive cluster
// enumeration. Nothing between this #if 0 and the matching #endif is compiled, and several of
// the functions are unfinished stubs.
#if 0
// Compute base color using least squares.
// Accumulates dot(C, I) and dot(I, I) over the eight texels, where I is the intensity modifier
// selected for each texel, and returns their ratio.
Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int indices[8]) {
// Compute dot(C, I) and dot(I, I)
Vector3 CI(0);
float II = 0;
for (int i = 0; i < 8; i++) {
Vector3 C = colors[i];
float I = etc_intensity_modifiers[table_index][indices[i]];
CI += C * I;
II += I * I;
}
return CI / II;
}
// @@ Do weighted least squares!
// NOTE(review): the weight is applied to the numerator only; the denominator is unweighted.
Vector3 base_color_least_squares(const Vector3 colors[8], const float weights[8], int table_index, int indices[8]) {
// Compute dot(C, I) and dot(I, I)
Vector3 CI(0);
float II = 0;
for (int i = 0; i < 8; i++) {
Vector3 C = colors[i];
float w = weights[i];
float I = etc_intensity_modifiers[table_index][indices[i]];
CI += C * I * w;
II += I * I;
}
return CI / II;
}
// Is this any faster than the above?
// Variant for colors that are already grouped: the first c0 colors use modifier 0, the next c1
// modifier 1, the next c2 modifier 2 and the remaining ones modifier 3.
Vector3 base_color_least_squares(const Vector3 colors[8], int table_index, int c0, int c1, int c2) {
// Compute dot(C, I) and dot(I, I)
Vector3 CI(0);
float I0 = etc_intensity_modifiers[table_index][0];
float I1 = etc_intensity_modifiers[table_index][1];
float I2 = etc_intensity_modifiers[table_index][2];
float I3 = etc_intensity_modifiers[table_index][3];
float II = 0;
II += c0 * I0 * I0;
II += c1 * I1 * I1;
II += c2 * I2 * I2;
II += (8-c0-c1-c2) * I3 * I3;
int i = 0;
for (; i < c0; i++) CI += colors[i] * I0;
for (; i < c0+c1; i++) CI += colors[i] * I1;
for (; i < c0+c1+c2; i++) CI += colors[i] * I2;
for (; i < 8; i++) CI += colors[i] * I3;
return CI / II;
}
// Fills selector[] for the grouping described above: c0 zeros, c1 ones, c2 twos, then threes.
static void selectors_for_clusters(int c0, int c1, int c2, int selector[8]) {
int i = 0;
for (; i < c0; i++) selector[i] = 0;
for (; i < c0+c1; i++) selector[i] = 1;
for (; i < c0+c1+c2; i++) selector[i] = 2;
for (; i < 8; i++) selector[i] = 3;
}
// Counts the (c0, c1, c2) triples with c0 + c1 + c2 <= count.
static int cluster_count(int count = 8) {
int total = 0;
for (uint c0 = 0; c0 <= count; c0++) {
for (uint c1 = 0; c1 <= count-c0; c1++) {
for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
total++;
}
}
}
// total is the number of possible cluster combinations.
return total;
}
// Does each partition have its own table index? Or is it shared for both?
// Stub: the body contains only notes and a commented-out loop.
void test_all_total_orders(const Vector4 colors[8], const float weights[8], int table_index) {
// @@ compute average luminance of each partition.
// @@ sort colors by the luminance differences with respect to the partition average.
// @@ compute luminance range, pick table index based on that. Try nearest indices also?
// For each cluster combination:
/*
for (uint c0 = 0; c0 <= count; c0++) {
for (uint c1 = 0; c1 <= count-c0; c1++) {
for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
// compute selectors.
int selector[8];
selectors_for_clusters(c0, c1, c2, selector);
// compute base colors that minimize error in each partition.
// determine error for these quantized base colors. Record best cluster combination.
}
}
}
*/
}
// Stub: no implementation yet.
void test_all_total_orders(const Vector4 input_colors[16], const float input_weights[16], uint count, bool flip, int table_index) {
// Slow method is to test both flip modes.
//test_all_total_orders(input_colors, input_weights, /*flip=*/false, int table_index);
//test_all_total_orders(input_colors, input_weights, /*flip=*/true, int table_index);
}
// @@ How do we compute the error for a given base color?
// Compute indices using range fitting / quantization of input colors?
// Compute indices using range fitting.
// Stub: enumerates the cluster combinations but does nothing with them.
void test_all_clusters() {
int count = 8; // Could be smaller.
for (uint c0 = 0; c0 <= count; c0++) {
Vector3 x1(0.0f);
float w1 = 0.0f;
for (uint c1 = 0; c1 <= count-c0; c1++) {
Vector3 x2(0.0f);
float w2 = 0.0f;
for (uint c2 = 0; c2 <= count-c0-c1; c2++) {
}
}
}
}
#endif
// Builds an opaque Color32 from integer channels, clamping each one to [0, 255].
static Color32 saturate_color(int R, int G, int B) {
    Color32 result;
    result.a = 255;
    result.b = U8(clamp(B, 0, 255));
    result.g = U8(clamp(G, 0, 255));
    result.r = U8(clamp(R, 0, 255));
    return result;
}
// Builds the 4-entry palette of a sub-block whose base color is a packed 555 color: the
// expanded base color plus each modifier of the selected intensity table, saturated to 8 bits.
static void get_diff_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);
    int base_r, base_g, base_b;
    unpack_color_555(packed_color, &base_r, &base_g, &base_b);
    const int * modifiers = etc_intensity_modifiers[table_idx];
    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }
}
// Same, for the sub-block whose base color is the 555 color plus a packed 333 delta.
// Returns false when base + delta left the 5-bit range (the palette is then built from the
// clamped color).
static bool get_diff_subblock_palette(uint16 packed_color, uint16 packed_delta, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);
    int base_r, base_g, base_b;
    const bool in_range = unpack_color_555(packed_color, packed_delta, &base_r, &base_g, &base_b);
    const int * modifiers = etc_intensity_modifiers[table_idx];
    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }
    return in_range;
}
// Builds the 4-entry palette of a sub-block whose base color is a packed 444 color: the
// expanded base color plus each modifier of the selected intensity table, saturated to 8 bits.
static void get_abs_subblock_palette(uint16 packed_color, uint table_idx, Color32 palette[4]) {
    assert(table_idx < 8);
    int base_r, base_g, base_b;
    unpack_color_444(packed_color, &base_r, &base_g, &base_b);
    const int * modifiers = etc_intensity_modifiers[table_idx];
    for (int entry = 0; entry < 4; entry++) {
        const int offset = modifiers[entry];
        palette[entry] = saturate_color(base_r + offset, base_g + offset, base_b + offset);
    }
}
// Returns the selector of the texel at (x, y). Selectors are stored column by column.
static int get_selector(const ETC_Data & data, int x, int y) {
    const int column_major_index = 4 * x + y;
    return data.selector[column_major_index];
}
// Returns the sub-block (0 or 1) containing the texel at (x, y) of an ETC1 block.
// With flip set the block is split between rows 1 and 2, otherwise between columns 1 and 2.
static int get_partition(const ETC_Data & data, int x, int y) {
    assert(data.mode == ETC_Data::Mode_ETC1);
    const int split_coordinate = data.etc.flip ? y : x;
    return split_coordinate > 1;
}
static void decode_etc1(const ETC_Data & data, Vector4 colors[16]) {
assert(data.mode == ETC_Data::Mode_ETC1);
Color32 palette[2][4];
if (data.etc.diff) {
// Decode colors in 555+333 mode.
get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
}
else {
// Decode colors in 444,444 mode.
get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
}
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
colors[y*4+x] = toVector4(palette[get_partition(data, x, y)][get_selector(data, x, y)]);
}
}
}
// Decodes a T-mode block into 16 colors in row-major order (index = 4*y + x).
// Palette: entry 0 is color0; entries 1, 2 and 3 are color1 + d, color1 and color1 - d, where
// d is the distance selected by the table index.
static void decode_etc2_t(const ETC_Data & data, Vector4 output_colors[16]) {
    assert(data.mode == ETC_Data::Mode_T);
    const int distance = etc_th_distances[data.t.table];

    Color32 palette[4];
    int red, green, blue;

    unpack_color_444(data.t.color0, &red, &green, &blue);
    palette[0] = saturate_color(red, green, blue);

    unpack_color_444(data.t.color1, &red, &green, &blue);
    palette[1] = saturate_color(red + distance, green + distance, blue + distance);
    palette[2] = saturate_color(red, green, blue);
    palette[3] = saturate_color(red - distance, green - distance, blue - distance);

    for (int idx = 0; idx < 16; idx++) {
        const int x = idx & 3;
        const int y = idx >> 2;
        output_colors[idx] = toVector4(palette[get_selector(data, x, y)]);
    }
}
static void decode_etc2_h(const ETC_Data & data, Vector4 output_colors[16]) {
assert(data.mode == ETC_Data::Mode_H);
int r, g, b;
Color32 palette[4];
int d = etc_th_distances[data.t.table];
unpack_color_444(data.t.color0, &r, &g, &b);
palette[0] = saturate_color(r + d, g + d, b + d);
palette[1] = saturate_color(r - d, g - d, b - d);
unpack_color_444(data.t.color1, &r, &g, &b);
palette[2] = saturate_color(r + d, g + d, b + d);
palette[3] = saturate_color(r - d, g - d, b - d);
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
output_colors[y*4+x] = toVector4(palette[get_selector(data, x, y)]);
}
}
}
static void decode_etc2_planar(const ETC_Data & data, Vector4 output_colors[16]) {
assert(data.mode == ETC_Data::Mode_Planar);
int ro, go, bo; // origin color
int rh, gh, bh; // horizontal color
int rv, gv, bv; // vertical color
// Unpack from 676
ro = bitexpand(data.planar.ro, 6, 8); // r << 2 | r >> 4
go = bitexpand(data.planar.go, 7, 8); // g << 1 | g >> 6
bo = bitexpand(data.planar.bo, 6, 8);
rh = bitexpand(data.planar.rh, 6, 8);
gh = bitexpand(data.planar.gh, 7, 8);
bh = bitexpand(data.planar.bh, 6, 8);
rv = bitexpand(data.planar.rv, 6, 8);
gv = bitexpand(data.planar.gv, 7, 8);
bv = bitexpand(data.planar.bv, 6, 8);
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
int r = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) >> 2;
int g = (4 * go + x * (gh - go) + y * (gv - go) + 2) >> 2;
int b = (4 * bo + x * (bh - bo) + y * (bv - bo) + 2) >> 2;
int idx = 4 * y + x;
output_colors[idx].x = saturate(float(r) / 255.0f);
output_colors[idx].y = saturate(float(g) / 255.0f);
output_colors[idx].z = saturate(float(b) / 255.0f);
output_colors[idx].w = 1;
}
}
}
// Decodes a block of any mode by dispatching to the matching decoder. Any mode other than
// ETC1, T or H is handled as planar.
static void decode_etc2(const ETC_Data & data, Vector4 colors[16]) {
    switch (data.mode) {
    case ETC_Data::Mode_ETC1:
        decode_etc1(data, colors);
        break;
    case ETC_Data::Mode_T:
        decode_etc2_t(data, colors);
        break;
    case ETC_Data::Mode_H:
        decode_etc2_h(data, colors);
        break;
    default: /* ETC_Data::Mode_Planar */
        decode_etc2_planar(data, colors);
        break;
    }
}
static float get_alpha11(int base, int table, int mul, int index) {
int elevenbase = base*8+4;
int tabVal = eac_intensity_modifiers[table][index];
int elevenTabVal = tabVal*8;
if(mul!=0) elevenTabVal*=mul;
else elevenTabVal/=8;
//calculate sum
int elevenbits = elevenbase+elevenTabVal;
//clamp..
if(elevenbits>=256*8) elevenbits=256*8-1;
else if(elevenbits<0) elevenbits=0;
//elevenbits now contains the 11 bit alpha value as defined in the spec.
//extend to 16 bits before returning, since we don't have any good 11-bit file formats.
uint16 sixteenbits = (elevenbits<<5)+(elevenbits>>6);
return float(sixteenbits) / 65535.0f;
}
// Reconstruct one EAC value in 8-bit mode: base + modifier * mul, clamped to 0..255 and
// returned normalized to 0..1.
static float get_alpha8(int base, int table, int mul, int index) {
    const int modifier = eac_intensity_modifiers[table][index];
    const int alpha = clamp(base + modifier * mul, 0, 255);
    return alpha / 255.0f;
}
static void decode_eac_8(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 3) {
for (int i = 0; i < 16; i++) {
int s = data.selector[4*(i%4) + i/4];
output_colors[i].component[output_channel] = get_alpha8(data.alpha, data.table_index, data.multiplier, s);
}
}
static void decode_eac_11(const EAC_Data & data, Vector4 output_colors[16], int output_channel = 0) {
for (int i = 0; i < 16; i++) {
int s = data.selector[4*(i%4) + i/4];
output_colors[i].component[output_channel] = get_alpha11(data.alpha, data.table_index, data.multiplier, s);
}
}
static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
Vector3 d = (p - c) * w;
return dot(d, d);
}
static float evaluate_rgb_mse(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, const ETC_Data & data) {
// Decode data and compare?
Vector4 colors[16];
decode_etc2(data, colors);
float error = 0;
for (int i = 0; i < 16; i++) {
error += input_weights[i] * evaluate_mse(input_colors[i].xyz(), colors[i].xyz(), options.color_weights);
}
return error;
}
static int select_table_index(const Vector3 & base_color, const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition) {
//float min_lum_delta = NV_FLOAT_MAX;
float max_lum_delta = -NV_FLOAT_MAX;
int xb = partition ? 2 : 0;
int xe = partition ? 4 : 2;
for (int y = 0; y < 4; y++) {
for (int x = xb; x < xe; x++) {
int idx = flip ? x*4 + y : y*4 + x;
float lum_delta = dot(base_color, Vector3(1.0f/3)) - dot(input_colors[idx].xyz(), Vector3(1.0f/3));
//min_lum_delta = min(min_lum_delta, lum_delta);
max_lum_delta = max(max_lum_delta, fabsf(lum_delta));
}
}
int best_range = -1;
float best_error = NV_FLOAT_MAX;
for (int i = 0; i < 8; i++) {
float error = fabsf(etc_intensity_range[i] - 255 * max_lum_delta);
if (error < best_error) {
best_error = error;
best_range = i;
}
}
return best_range;
}
static float update_selectors(const Vector4 input_colors[16], const float input_weights[16], ETC_Data & data, const ETC_Options & options) {
Color32 palette[2][4];
if (data.etc.diff) {
// Decode colors in 555+333 mode.
get_diff_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
get_diff_subblock_palette(data.etc.color0, data.etc.color1, data.etc.table1, palette[1]);
}
else {
// Decode colors in 444,444 mode.
get_abs_subblock_palette(data.etc.color0, data.etc.table0, palette[0]);
get_abs_subblock_palette(data.etc.color1, data.etc.table1, palette[1]);
}
float total_error = 0;
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
int i = y*4 + x;
float best_error = NV_FLOAT_MAX;
int best_p = 0;
for (int p = 0; p < 4; p++) {
float error = evaluate_mse(toVector3(palette[get_partition(data, x, y)][p]), input_colors[i].xyz(), options.color_weights);
if (error < best_error) {
best_error = error;
best_p = p;
}
}
int s = x*4 + y;
data.selector[s] = U8(best_p);
total_error += best_error * input_weights[i];
}
}
return total_error;
}
static void partition_input_block(const Vector4 input_colors[16], const float input_weights[16], bool flip, int partition, Vector3 output_colors[8], float output_weights[8]) {
const int xb = partition ? 2 : 0;
const int xe = partition ? 4 : 2;
for (int y = 0, i = 0; y < 4; y++) {
for (int x = xb; x < xe; x++, i++) {
int idx = flip ? x*4 + y : y*4 + x;
output_colors[i] = input_colors[idx].xyz();
output_weights[i] = input_weights[idx];
}
}
}
// Working state for one 8-pixel ETC1 sub-block, used by the experimental sub-block helpers
// (evaluate_rgb_mse, optimize_base_color).
struct ETC_SubBlock {
// Base color. The helpers add etc_intensity_modifiers[table][k] / 255.0f to it to build the palette.
Vector3 color;
// NOTE(review): not read by any code visible here; presumably marks differential encoding - confirm.
bool delta;
// Row of etc_intensity_modifiers used by this sub-block.
int table;
// Per-pixel palette entry (0..3), i.e. column of etc_intensity_modifiers[table].
int indices[8];
};
// Error of a sub-block description against its 8 input colors.
// The palette is the base color plus each of the four modifiers of the selected table
// (no quantization or clamping is applied), and every pixel is compared against the entry
// named by sub_block->indices.
static float evaluate_rgb_mse(const Vector3 colors[8], const float weights[8], const ETC_Options & options, ETC_SubBlock * sub_block) {
    Vector3 palette[4];
    for (int p = 0; p < 4; p++) {
        palette[p] = sub_block->color + Vector3(etc_intensity_modifiers[sub_block->table][p] / 255.0f);
    }

    float total = 0;
    for (int i = 0; i < 8; i++) {
        total += evaluate_mse(colors[i], palette[sub_block->indices[i]], options.color_weights) * weights[i];
    }
    return total;
}
// For a fixed table and index selection, find the base color that minimizes the weighted
// squared error. Each pixel is reconstructed as (base + Di), so per channel the optimum is
//     base = sum(w_i * (C_i - D_i)) / sum(w_i)
// The RGB components are independent and share the same modifier sum.
// If the weights sum to zero (or less) the base color is left untouched.
static void optimize_base_color(const Vector3 colors[8], const float weights[8], ETC_SubBlock * sub_block) {
    float D_sum = 0;
    float R_sum = 0;
    float G_sum = 0;
    float B_sum = 0;
    float W_sum = 0;

    for (int i = 0; i < 8; i++) {
        const float Di = etc_intensity_modifiers[sub_block->table][sub_block->indices[i]] / 255.0f; // @@ precompute?
        D_sum += Di * weights[i];
        R_sum += colors[i].x * weights[i];
        G_sum += colors[i].y * weights[i];
        B_sum += colors[i].z * weights[i];
        W_sum += weights[i];
    }

    // Nothing to fit, and dividing would produce NaN/inf.
    if (!(W_sum > 0)) {
        return;
    }

    // Every channel uses its own sum (green and blue used to be computed from the red sum).
    sub_block->color.x = (R_sum - D_sum) / W_sum;
    sub_block->color.y = (G_sum - D_sum) / W_sum;
    sub_block->color.z = (B_sum - D_sum) / W_sum;

    // @@ Estimate error (without quantization)
    // @@ Repeat for all tables?
    // @@ Given a new center, compute new indices, then update center?
}
// Compact the color list in place: entries with zero weight are dropped and exact duplicates
// are merged into their first occurrence, which receives the sum of their weights.
// Returns the number of entries kept at the front of both arrays.
static int reduce_colors(Vector3 * colors, float * weights, int count) {
    int kept = 0;
    for (int i = 0; i < count; i++) {
        // Zero weight means the entry is unused or was already merged into an earlier one.
        if (weights[i] != 0.0f) {
            colors[kept] = colors[i];
            weights[kept] = weights[i];

            // Fold every later duplicate into this entry and mark it as consumed.
            for (int j = i + 1; j < count; j++) {
                if (colors[i] == colors[j]) { // @@ Compare within threshold?
                    weights[kept] += weights[j];
                    weights[j] = 0.0f;
                }
            }
            kept++;
        }
    }
    return kept;
}
// Stable in-place sort of (color, weight) pairs by increasing r + g + b.
// count must not exceed 8. Colors and weights are permuted together so that weights[i]
// keeps describing colors[i] after sorting.
static void sort_colors(Vector3 * colors, float * weights, int count) {
    assert(count <= 8);

    // Sort key of every entry.
    float lum[8];
    for (int i = 0; i < count; ++i) {
        lum[i] = colors[i].x + colors[i].y + colors[i].z;
    }

    // Insertion sort: stable, and cheap for at most 8 elements.
    for (int i = 0; i < count; ++i) {
        for (int j = i; j > 0 && lum[j] < lum[j - 1]; --j) {
            swap(lum[j], lum[j - 1]);
            swap(colors[j], colors[j - 1]);
            // The weights used to be left behind, pairing colors with the wrong weight.
            swap(weights[j], weights[j - 1]);
        }
    }
}
/*
float optimize_center(float colors[4][10], uniform int p, uniform int table_level)
{
float best_center = 0;
for (uniform int q = 0; q < 4; q++)
{
best_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
}
best_center /= 8;
float best_err = 0;
for (uniform int q = 0; q < 4; q++)
{
float dY = get_etc1_dY(table_level, q);
best_err += sq(clamp(best_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
}
for (uniform int branch = 0; branch < 4; branch++)
{
float new_center = 0;
float sum = 0;
for (uniform int q = 0; q < 4; q++)
{
if (branch <= 1 && q <= branch) continue;
if (branch >= 2 && q >= branch) continue;
new_center += (colors[q][7 + p] - get_etc1_dY(table_level, q)) * colors[q][3];
sum += colors[q][3];
}
new_center /= sum;
float err = 0;
for (uniform int q = 0; q < 4; q++)
{
float dY = get_etc1_dY(table_level, q);
err += sq(clamp(new_center + dY, 0, 255) - colors[q][7 + p]) * colors[q][3];
}
if (err < best_err)
{
best_err = err;
best_center = new_center;
}
}
return best_center;
}
*/
static void compress_etc1_test(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
Vector3 colors[8];
float weights[8];
//int xrefs[8];
ETC_SubBlock sub_block[2];
bool best_flip = false;
for (int flip = 0; flip <= 1; flip++) {
partition_input_block(input_colors, input_weights, !!flip, /*partition=*/0, colors, weights);
int count = reduce_colors(colors, weights, 8);
//sort_colors(colors, weights);
// @@ sort colors along luminance axis.
//sub_block[0].color
partition_input_block(input_colors, input_weights, !!flip, /*partition=*/1, colors, weights);
}
//pack_colors(sub_block[0].color, sub_block[1].color, &result->data);
result->error = update_selectors(input_colors, input_weights, result->data, options);
}
/*void pack_colors(const Vector3 & color0, const Vector3 & color1, const ETC_Options & options, ETC_Data * data) {
uint16 abs_c0 = U16(pack_color_444(color0));
uint16 abs_c1 = U16(pack_color_444(color1));
Vector3 abs_vc0 = unpack_color_444(abs_c0);
Vector3 abs_vc1 = unpack_color_444(abs_c1);
float abs_error = evaluate_mse(color0, abs_vc0, options.color_weights) + evaluate_mse(color1, abs_vc1, options.color_weights);
uint16 diff_c0 = U16(pack_color_555(color0));
Vector3 diff_vc0 = unpack_color_555(diff_c0);
uint16 diff_d1 = U16(pack_delta_333(color1 - diff_vc0));
Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
float diff_error = evaluate_mse(color0, diff_vc0, options.color_weights) + evaluate_mse(color1, diff_vc1, options.color_weights);
if (diff_error < abs_error) {
data->etc.color0 = diff_c0;
data->etc.color1 = diff_c1;
return diff_error;
}
else {
if (abs_error < best_error) {
best_error = abs_error;
best_diff = false;
best_flip = flip;
best_c0 = abs_c0;
best_c1 = abs_c1;
best_vc0 = abs_vc0;
best_vc1 = abs_vc1;
}
}
}*/
static void compress_etc1_range_fit(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
float best_error = NV_FLOAT_MAX;
bool best_diff = false;
bool best_flip = false;
uint16 best_c0 = 0;
uint16 best_c1 = 0;
Vector3 best_vc0;
Vector3 best_vc1;
for (int flip = 0; flip <= 1; flip++) {
Vector3 color0 = get_partition_color_average(input_colors, input_weights, !!flip, /*partition=*/0);
Vector3 color1 = get_partition_color_average(input_colors, input_weights, !!flip, /*partition=*/1);
uint16 abs_c0 = U16(pack_color_444(color0));
uint16 abs_c1 = U16(pack_color_444(color1));
Vector3 abs_vc0 = unpack_color_444(abs_c0);
Vector3 abs_vc1 = unpack_color_444(abs_c1);
float abs_error = evaluate_mse(color0, abs_vc0, options.color_weights) + evaluate_mse(color1, abs_vc1, options.color_weights);
uint16 diff_c0 = U16(pack_color_555(color0));
Vector3 diff_vc0 = unpack_color_555(diff_c0);
uint16 diff_d1 = U16(pack_delta_333(color1 - diff_vc0));
Vector3 diff_vc1 = unpack_color_555(diff_c0, diff_d1);
float diff_error = evaluate_mse(color0, diff_vc0, options.color_weights) + evaluate_mse(color1, diff_vc1, options.color_weights);
if (diff_error < abs_error) {
if (diff_error < best_error) {
best_error = diff_error;
best_diff = true;
best_flip = !!flip;
best_c0 = diff_c0;
best_c1 = diff_d1;
best_vc0 = diff_vc0;
best_vc1 = diff_vc1;
}
}
else {
if (abs_error < best_error) {
best_error = abs_error;
best_diff = false;
best_flip = !!flip;
best_c0 = abs_c0;
best_c1 = abs_c1;
best_vc0 = abs_vc0;
best_vc1 = abs_vc1;
}
}
}
result->data.mode = ETC_Data::Mode_ETC1;
result->data.etc.flip = best_flip;
result->data.etc.diff = best_diff;
result->data.etc.table0 = select_table_index(best_vc0, input_colors, input_weights, best_flip, /*partition=*/0);
result->data.etc.table1 = select_table_index(best_vc1, input_colors, input_weights, best_flip, /*partition=*/1);
result->data.etc.color0 = best_c0;
result->data.etc.color1 = best_c1;
result->error = update_selectors(input_colors, input_weights, result->data, options);
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}
#if HAVE_RGETC
#include "nvimage/ColorBlock.h"
void compress_etc1_rg(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
rg_etc1::etc1_pack_params pack_params;
//pack_params.m_quality = rg_etc1::cLowQuality;
pack_params.m_quality = rg_etc1::cMediumQuality; // @@ Select quality based on compression options.
ColorBlock rgba;
for (uint i = 0; i < 16; i++) {
rgba.color(i) = toColor32(input_colors[i]);
}
rgba.swizzle(2, 1, 0, 3);
BlockETC block;
rg_etc1::pack_etc1_block((void *)&block, (const uint *)rgba.colors(), pack_params);
unpack_etc2_block(&block, &result->data);
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}
#endif
static void compress_etc2_planar_solid(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
Vector3 C(0);
float W = 0;
for (int i = 0; i < 16; i++) {
C += input_colors[i].xyz() * input_weights[i];
W += input_weights[i];
}
C /= W;
// Convert colors to 676
result->data.mode = ETC_Data::Mode_Planar;
result->data.planar.ro = pack_float_6(C.x);
result->data.planar.go = pack_float_7(C.y);
result->data.planar.bo = pack_float_6(C.z);
result->data.planar.rh = result->data.planar.ro;
result->data.planar.gh = result->data.planar.go;
result->data.planar.bh = result->data.planar.bo;
result->data.planar.rv = result->data.planar.ro;
result->data.planar.gv = result->data.planar.go;
result->data.planar.bv = result->data.planar.bo;
// Evaluate error.
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
}
// Least squares optimization of planar endpoints.
static void compress_etc2_planar_lsqr(const Vector4 input_colors[16], const float input_weights[16], const ETC_Options & options, ETC_Solution * result) {
// Isn't this a simple least squares problem?
// - Yes, but that doesn't take clamping and quantization into account.
// - Solve the least squares problem, then refine endpoints?
// This matrix is always the same! But not when using arbitrary weights!
// This would be faster computing the matrix first, then multiplying by the weight covariance matrix.
Matrix3 m(0);
// For every pixel, decoder does:
// int r = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) >> 2;
// R(x,y) = (4 * ro + x * (rh - ro) + y * (rv - ro) + 2) / 4;
// R(x,y) = ro * (1 - x/4 - y/4) + rh * x/4 + rv * y/4 + 1/2;
// a = x/4
// b = y/4
// c = 1 - a - b
// R(x,y) = ro * c + rh * a + rv * b + 1/2;
float A[3 * 16];
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
float w = input_weights[4*y+x];
//if ((x == 1 || x == 2) && (y == 1 && y == 2)) w *= 0.5;
float a = float(x) / 4 * w;
float b = float(y) / 4 * w;
float c = (1 - a - b) * w;
int i = y*4 + x;
A[3 * i + 0] = a;
A[3 * i + 1] = b;
A[3 * i + 2] = c;
/*for (int yy = 0; yy < 4; yy++) {
for (int xx = 0; xx < 4; xx++) {
float ww = input_weights[4*yy+xx];
//if ((xx == 1 || xx == 2) && (yy == 1 && yy == 2)) ww *= 0.5;
float aa = float(xx) / 4 * ww;
float bb = float(yy) / 4 * ww;
float cc = (1 - aa - bb) * ww;
m(0,0) += a * aa;
m(1,0) += b * aa;
m(2,0) += c * aa;
m(0,1) += a * bb;
m(1,1) += b * bb;
m(2,1) += c * bb;
m(0,2) += a * cc;
m(1,2) += b * cc;
m(2,2) += c * cc;
}
}*/
}
}
// At*A
for (int y = 0; y < 3; y++) {
for (int x = 0; x < 3; x++) {
float d = 0;
for (int i = 0; i < 16; i++) {
d += A[3*i+x] * A[3*i+y];
}
m(x, y) = d;
}
}
// Compute right side:
Vector3 Ca(0), Cb(0), Cc(0);
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
float a = float(x) / 4;
float b = float(y) / 4;
float c = 1 - a - b;
Vector3 C = input_colors[4*y+x].xyz() - Vector3(0.5f / 255);
Ca += C * a;
Cb += C * b;
Cc += C * c;
}
}
// Now we have 3 equations (one for each color component).
Vector3 R(Ca.x, Cb.x, Cc.x);
Vector3 G(Ca.y, Cb.y, Cc.y);
Vector3 B(Ca.z, Cb.z, Cc.z);
Vector3 r, g, b;
if (!solveLU(m, R, &r)) {
result->error = NV_FLOAT_MAX;
return;
}
if (!solveLU(m, G, &g)) {
result->error = NV_FLOAT_MAX;
return;
}
if (!solveLU(m, B, &b)) {
result->error = NV_FLOAT_MAX;
return;
}
Vector3 Ch(r.x, g.x, b.x);
Vector3 Cv(r.y, g.y, b.y);
Vector3 Co(r.z, g.z, b.z);
// Convert colors to 676
result->data.mode = ETC_Data::Mode_Planar;
result->data.planar.ro = pack_float_6(Co.x);
result->data.planar.go = pack_float_7(Co.y);
result->data.planar.bo = pack_float_6(Co.z);
result->data.planar.rh = pack_float_6(Ch.x);
result->data.planar.gh = pack_float_7(Ch.y);
result->data.planar.bh = pack_float_6(Ch.z);
result->data.planar.rv = pack_float_6(Cv.x);
result->data.planar.gv = pack_float_7(Cv.y);
result->data.planar.bv = pack_float_6(Cv.z);
// Evaluate error.
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
bool refine_endpoints = true;
if (refine_endpoints) {
ETC_Solution best = *result;
// @@ The per-component errors are not correllated, test 8 combinations 3 times.
for (int i = 0; i < 8; i++) {
result->data.planar.ro = pack_float_6(Co.x, (i & 1) != 0);
result->data.planar.rh = pack_float_6(Ch.x, (i & 2) != 0);
result->data.planar.rv = pack_float_6(Cv.x, (i & 4) != 0);
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
if (result->error < best.error) {
best = *result;
}
}
*result = best;
for (int i = 0; i < 8; i++) {
result->data.planar.go = pack_float_7(Co.y, (i & 1) != 0);
result->data.planar.gh = pack_float_7(Ch.y, (i & 2) != 0);
result->data.planar.gv = pack_float_7(Cv.y, (i & 4) != 0);
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
if (result->error < best.error) {
best = *result;
}
}
*result = best;
for (int i = 0; i < 8; i++) {
result->data.planar.bo = pack_float_6(Co.z, (i & 1) != 0);
result->data.planar.bh = pack_float_6(Ch.z, (i & 2) != 0);
result->data.planar.bv = pack_float_6(Cv.z, (i & 4) != 0);
result->error = evaluate_rgb_mse(input_colors, input_weights, options, result->data);
if (result->error < best.error) {
best = *result;
}
}
*result = best;
}
}
// Clamp all input colors in place with saturate().
// @@ Sanitize input_weights?
//  - Avoid blocks with all zero weight.
//  - Normalize weights to avoid too small values?
//  - Remove NaNs, infinites, etc.
static void process_input_colors(Vector4 input_colors[16]) {
    Vector4 * const end = input_colors + 16;
    for (Vector4 * color = input_colors; color != end; ++color) {
        *color = saturate(*color);
    }
}
// Clamp the selected channel of every input color in place with saturate().
static void process_input_alphas(Vector4 input_colors[16], int input_channel) {
    for (int i = 0; i < 16; i++) {
        float & value = input_colors[i].component[input_channel];
        value = saturate(value);
    }
}
// Sanitize the per-pixel weights in place.
//  - Negative weights, weights below a small threshold, and non-finite weights (NaN, +/-inf)
//    are set to zero.
//  - The remaining weights are divided by the largest one, so the maximum becomes 1.
//  - If no weight is larger than the threshold (degenerate block), all weights are set to 1.
static void process_input_weights(float input_weights[16]) {
    const float min_weight = 0.0001f;

    float max_weight = 0.0f;
    for (int i = 0; i < 16; i++) {
        float w = input_weights[i];

        // (w - w) is zero only for finite values; it is NaN for NaN and for +/-infinity.
        // A non-finite weight would otherwise turn every normalized weight into NaN or zero.
        const bool finite = (w - w) == 0.0f;

        // Clamp to positive and flush tiny values to zero.
        if (!finite || w < min_weight) {
            w = 0.0f;
        }

        input_weights[i] = w;
        if (w > max_weight) {
            max_weight = w;
        }
    }

    if (max_weight <= min_weight) {
        // Handle degenerate case.
        for (int i = 0; i < 16; i++) {
            input_weights[i] = 1;
        }
        return;
    }

    // Normalize.
    for (int i = 0; i < 16; i++) {
        input_weights[i] /= max_weight;
    }
}
// ETC2 compressor with 1-bit (punch-through) alpha. Not implemented yet.
// Fully transparent blocks return an error of 0 without writing any output; every other block
// triggers nvCheck(false) and returns NV_FLOAT_MAX.
static float compress_etc_a1(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
    assert(options.onebit_alpha == true);

    // Classify block.
    bool all_transparent = true;
    bool all_opaque = true;
    for (int i = 0; i < 16; i++) {
        const float alpha = input_colors[i].w;
        all_transparent = all_transparent && (alpha == 0);
        all_opaque = all_opaque && (alpha == 1);
    }

    if (all_transparent) {
        // @@ Encode trivial transparent block.
        return 0;
    }

    if (all_opaque) {
        // @@ Encode block with opaque bit set. @@ Isn't this like the standard encoder?
    }

    // @@ Encode mixed block.
    nvCheck(false); // Not implemented!

    //uint8 color_rgb[16*3];
    //uint8 alpha[16];
    //uint etc_word1, etc_word2;
    //compressBlockDifferentialWithAlpha(bool isTransparent, uint8* img, uint8* alphaimg, uint8* imgdec, 4, 4, 0, 0, &etc_word1, &etc_word2);

    return NV_FLOAT_MAX;
}
//uint etc_blocks = 0;
//uint planar_blocks = 0;
//#include "nvthread/Atomic.h"
// Compress one block to ETC1/ETC2 without punch-through alpha.
// Runs the enabled compressors, keeps the solution with the lowest error, packs it into
// output and returns that error. T and H modes are not implemented.
static float compress_etc(Vector4 input_colors[16], float input_weights[16], const ETC_Options & options, void * output) {
    assert(options.onebit_alpha == false);

    // Baseline: the range fit compressor always runs.
    ETC_Solution best;
    compress_etc1_range_fit(input_colors, input_weights, options, &best);

#if HAVE_RGETC
    if (options.use_rg_etc) {
        ETC_Solution rg_candidate;
        compress_etc1_rg(input_colors, input_weights, options, &rg_candidate);
        if (rg_candidate.error < best.error) {
            best = rg_candidate;
        }
    }
#else
    // @@ Print warning?
#endif

    if (options.enable_etc2 && options.use_planar) {
        ETC_Solution planar_candidate;
        compress_etc2_planar_lsqr(input_colors, input_weights, options, &planar_candidate);
        if (planar_candidate.error < best.error) {
            best = planar_candidate;
            //nv::atomicIncrement(&planar_blocks);
        }
        else {
            //nv::atomicIncrement(&etc_blocks);
        }
    }

    // @@ options.use_t_mode and options.use_h_mode are accepted but have no encoder yet.

    pack_etc2_block(best.data, (BlockETC *)output);
    return best.error;
}
// Range search EAC compressor, slightly modified from ETCLib.
// For every modifier table, a base value and a multiplier are estimated from the range of the
// input channel, and all (base, multiplier) pairs within options.search_radius of that estimate
// are tried. For each pair the best selector of every pixel is found exhaustively. The best
// combination is packed into output and its weighted squared error is returned.
float compress_eac_range_search(Vector4 input_colors[16], float input_weights[16], int input_channel, const EAC_Options & options, void * output) {

    // Gather the channel values and find their range.
    float values[16];
    float lowest = 1.0f;
    float highest = 0.0f;
    for (uint i = 0; i < 16; i++) {
        values[i] = input_colors[i].component[input_channel];
        lowest = nv::min(lowest, values[i]);
        highest = nv::max(highest, values[i]);
    }
    const float value_range = highest - lowest;

    EAC_Solution best;
    best.error = NV_FLOAT_MAX;

    const uint table_count = 16;
    const uint selector_count = 1 << 3;     // 3 selector bits.
    const uint low_selector = 3;            // Column used as the low end of a table's span.
    const uint high_selector = 7;           // Column used as the high end of a table's span.

    // try each modifier table entry
    for (uint t = 0; t < table_count; t++) {
        const float table_center = (float)-eac_intensity_modifiers[t][low_selector];
        const float table_range = (float)eac_intensity_modifiers[t][high_selector] - eac_intensity_modifiers[t][low_selector];
        const float center_ratio = table_center / table_range;

        // Estimated base value and the window searched around it.
        const int center = ftoi_round(255.0f * (lowest + center_ratio * value_range));
        const int first_base = max(0, center - options.search_radius);
        const int last_base = min(center + options.search_radius, 255);

        // Estimated multiplier and its window; neither depends on the base being tried.
        const int ideal_multiplier = ftoi_round(255 * value_range / table_range);
        const int first_multiplier = clamp(ideal_multiplier - options.search_radius, 1, 15);
        const int last_multiplier = clamp(ideal_multiplier + options.search_radius, 1, 15);

        for (int base = first_base; base <= last_base; base++) {
            for (int multiplier = first_multiplier; multiplier <= last_multiplier; multiplier++) {

                // find best selector for each pixel
                float block_error = 0;
                uint chosen[16];
                for (uint i = 0; i < 16; i++) {
                    float pixel_error = NV_FLOAT_MAX;

                    for (uint s = 0; s < selector_count; s++) {
                        const float decoded = options.use_11bit_mode
                            ? get_alpha11(base, t, multiplier, s)
                            : get_alpha8(base, t, multiplier, s);

                        const float delta = decoded - values[i];
                        const float squared = delta * delta;
                        if (squared < pixel_error) {
                            pixel_error = squared;
                            chosen[i] = s;
                        }
                    }

                    block_error += pixel_error * input_weights[i];
                    if (block_error > best.error) {
                        break; // Don't waste more time.
                    }
                }

                if (block_error < best.error) {
                    best.error = block_error;
                    best.data.alpha = base;
                    best.data.multiplier = multiplier;
                    best.data.table_index = t;
                    for (uint i = 0; i < 16; i++) {
                        // Flip selectors.
                        best.data.selector[i] = chosen[4*(i%4) + i/4];
                    }
                }
            }
        }
    }

    pack_eac_block(best.data, (BlockEAC *)output);
    return best.error;
}
// Public API:
void nv::decompress_etc(const void * input_block, Vector4 output_colors[16]) {
#if 1 // Our code
ETC_Data data;
unpack_etc2_block((const BlockETC *)input_block, &data);
decode_etc2(data, output_colors);
#elif HAVE_RGETC && 0
Color32 colors[16];
rg_etc1::unpack_etc1_block(input_block, &colors->u);
for (int i = 0; i < 16; i++) {
output_colors[i].x = colors[i].b * (1.0f / 255.0f);
output_colors[i].y = colors[i].g * (1.0f / 255.0f);
output_colors[i].z = colors[i].r * (1.0f / 255.0f);
output_colors[i].w = colors[i].a * (1.0f / 255.0f);
}
#elif HAVE_ETCPACK // Use etcpack for reference.
const BlockETC * block = (const BlockETC *)input_block;
uint8 colors[3*16];
uint part1 = POSH_SwapU32(block->data32[0]);
uint part2 = POSH_SwapU32(block->data32[1]);
decompressBlockETC2(part1, part2, colors, 4, 4, 0, 0);
for (int i = 0; i < 16; i++) {
output_colors[i].x = colors[3*i+0] * (1.0f / 255.0f);
output_colors[i].y = colors[3*i+1] * (1.0f / 255.0f);
output_colors[i].z = colors[3*i+2] * (1.0f / 255.0f);
output_colors[i].w = 1.0f;
}
#endif
}
void nv::decompress_eac(const void * input_block, Vector4 output_colors[16], int output_channel) {
nvCheck(output_channel >= 0 && output_channel < 4);
#if 1
EAC_Data data;
unpack_eac_block((const BlockEAC *)input_block, &data);
decode_eac_11(data, output_colors, output_channel);
#elif HAVE_ETCPACK
// Use etcpack for reference.
formatSigned = 0;
uint16 alphas[16];
decompressBlockAlpha16bit((uint8*)input_block, (uint8*)alphas, 4, 4, 0, 0);
for (int i = 0; i < 16; i++) {
uint16 alpha = POSH_SwapU16(alphas[i]);
output_colors[i].component[output_channel] = alpha * (1.0f / 65535.0f);
}
#endif
}
void nv::decompress_etc_eac(const void * input, Vector4 output_colors[16]) {
#if 1
BlockETC_EAC * input_block = (BlockETC_EAC *)input;
ETC_Data etc;
unpack_etc2_block(&input_block->etc, &etc);
decode_etc2(etc, output_colors);
EAC_Data eac;
unpack_eac_block(&input_block->eac, &eac);
decode_eac_8(eac, output_colors, 3);
#elif HAVE_ETCPACK
// Use etcpack for reference.
uint8 colors[4*16];
decompressBlockAlpha((uint8*)input_block, colors, 4, 4, 0, 0);
for (int i = 0; i < 16; i++) {
output_colors[i].x = colors[4*i+0] * (1.0f / 255.0f);
output_colors[i].y = colors[4*i+1] * (1.0f / 255.0f);
output_colors[i].z = colors[4*i+2] * (1.0f / 255.0f);
output_colors[i].w = colors[4*i+3] * (1.0f / 255.0f);
}
#endif
}
// Compress one block to ETC1 and return the error of the encoded block.
// input_colors and input_weights are sanitized in place before compression, like the other
// compress_* entry points do: colors are clamped and weights are cleaned up and normalized,
// so that the weighted range fit never sees negative or all-zero weights.
// color_weights scales the per-channel error.
float nv::compress_etc1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
    process_input_colors(input_colors);
    process_input_weights(input_weights);

    // @@ Use same options for all blocks?
    ETC_Options options;
    options.use_rg_etc = true;
    options.enable_etc2 = false;
    options.use_t_mode = false;
    options.use_h_mode = false;
    options.use_planar = false;
    options.color_weights = color_weights;

    return compress_etc(input_colors, input_weights, options, output);
}
// Compress one block to ETC2 (ETC1 modes plus planar) and return the error of the encoded
// block. input_colors and input_weights are sanitized in place first.
float nv::compress_etc2(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
    process_input_colors(input_colors);
    process_input_weights(input_weights);

    ETC_Options options;
    options.color_weights = color_weights;
    options.use_rg_etc = true;
    options.enable_etc2 = true;
    options.use_planar = true;
    options.use_t_mode = false; // @@ Not implemented.
    options.use_h_mode = false; // @@ Not implemented.

    return compress_etc(input_colors, input_weights, options, output);
}
// Compress one block to ETC2 with 1-bit alpha. input_colors and input_weights are sanitized
// in place first. The underlying encoder (compress_etc_a1) is not implemented yet.
float nv::compress_etc2_a1(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
    process_input_colors(input_colors);
    process_input_weights(input_weights);

    ETC_Options options;
    options.color_weights = color_weights;
    options.onebit_alpha = true;
    options.use_rg_etc = true;
    options.enable_etc2 = true;
    options.use_planar = true;
    options.use_t_mode = false; // @@ Not implemented.
    options.use_h_mode = false; // @@ Not implemented.

    return compress_etc_a1(input_colors, input_weights, options, output);
}
// Compress one channel (0..3) of the block to EAC and return the error of the encoded block.
// The selected channel and the weights are sanitized in place first. search_radius sets how far
// around the estimated base and multiplier the encoder searches; use_11bit_mode selects the
// 11-bit reconstruction instead of the 8-bit one.
float nv::compress_eac(Vector4 input_colors[16], float input_weights[16], int input_channel, int search_radius, bool use_11bit_mode, void * output) {
    nvCheck(input_channel >= 0 && input_channel < 4);

    process_input_alphas(input_colors, input_channel);
    process_input_weights(input_weights);

    EAC_Options options;
    options.use_11bit_mode = use_11bit_mode;
    options.search_radius = search_radius;

    return compress_eac_range_search(input_colors, input_weights, input_channel, options, output);
}
// Compress one block to ETC2 color + EAC alpha and return the sum of both errors.
// The color part is compressed first; alpha uses channel 3 in 8-bit mode with a search radius of 1.
float nv::compress_etc2_eac(Vector4 input_colors[16], float input_weights[16], const Vector3 & color_weights, void * output) {
    BlockETC_EAC * packed = (BlockETC_EAC *)output;

    const float color_error = compress_etc2(input_colors, input_weights, color_weights, &packed->etc);
    const float alpha_error = compress_eac(input_colors, input_weights, /*input_channel=*/3, /*search_radius=*/1, /*use_11bit_mode=*/false, &packed->eac);

    return color_error + alpha_error;
}