diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp index 4f5602a..7b91e2b 100644 --- a/src/nvtt/ClusterFit.cpp +++ b/src/nvtt/ClusterFit.cpp @@ -411,19 +411,19 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) #else inline Vector3 round565(const Vector3 & v) { - uint r = ftoi_floor(v.x * 31.0f); + uint r = ftoi_trunc(v.x * 31.0f); float r0 = float(((r+0) << 3) | ((r+0) >> 2)); float r1 = float(((r+1) << 3) | ((r+1) >> 2)); if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U); r = (r << 3) | (r >> 2); - uint g = ftoi_floor(v.y * 63.0f); + uint g = ftoi_trunc(v.y * 63.0f); float g0 = float(((g+0) << 2) | ((g+0) >> 4)); float g1 = float(((g+1) << 2) | ((g+1) >> 4)); if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U); g = (g << 2) | (g >> 4); - uint b = ftoi_floor(v.z * 31.0f); + uint b = ftoi_trunc(v.z * 31.0f); float b0 = float(((b+0) << 3) | ((b+0) >> 2)); float b1 = float(((b+1) << 3) | ((b+1) >> 2)); if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U); @@ -474,8 +474,10 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end) // clamp to the grid a = clamp(a, 0, 1); b = clamp(b, 0, 1); - //a = floor(grid * a + 0.5f) * gridrcp; - //b = floor(grid * b + 0.5f) * gridrcp; +#if 1 + a = floor(grid * a + 0.5f) * gridrcp; + b = floor(grid * b + 0.5f) * gridrcp; +#else //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; @@ -496,7 +498,7 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end) a = round565(a); b = round565(b); - +#endif // compute the error Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); @@ -582,9 +584,10 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end) // clamp to the grid a = clamp(a, 0, 1); b = clamp(b, 0, 1); - //a = floor(a * grid + 0.5f) * gridrcp; - //b = floor(b * grid + 0.5f) * gridrcp; - +#if 0 + a = floor(a * grid + 0.5f) * gridrcp; + b = floor(b * grid + 0.5f) * gridrcp; +#else //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f; @@ -606,6 +609,8 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end) a = round565(a); b = round565(b); +#endif + // @@ It would be much more accurate to evaluate the error exactly. // compute the error Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h index 39dcbfb..72c9ef9 100644 --- a/src/nvtt/ClusterFit.h +++ b/src/nvtt/ClusterFit.h @@ -31,8 +31,8 @@ #include "nvmath/Vector.h" // Use SIMD version if altivec or SSE are available. -//#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) -#define NVTT_USE_SIMD 0 +#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) +//#define NVTT_USE_SIMD 0 namespace nv { diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp index 1fc3254..aaef88d 100644 --- a/src/nvtt/CompressorDX9.cpp +++ b/src/nvtt/CompressorDX9.cpp @@ -113,9 +113,40 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha } +namespace nv { + float compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output); +} + #if 1 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) { +#if 1 + // @@ This setup is the same for all compressors. + Vector3 input_colors[16]; + float input_weights[16]; + + uint x, y; + for (y = 0; y < set.h; y++) { + for (x = 0; x < set.w; x++) { + input_colors[4*y+x] = set.color(x, y).xyz(); + input_weights[4*y+x] = 1.0f; + if (alphaMode == nvtt::AlphaMode_Transparency) input_weights[4*y+x] = set.color(x, y).z; + } + for (; x < 4; x++) { + input_colors[4*y+x] = Vector3(0); + input_weights[4*y+x] = 0.0f; + } + } + for (; y < 4; y++) { + for (x = 0; x < 4; x++) { + input_colors[4*y+x] = Vector3(0); + input_weights[4*y+x] = 0.0f; + } + } + + compress_dxt1(input_colors, input_weights, compressionOptions.colorWeight.xyz(), (BlockDXT1 *)output); + +#else set.setUniformWeights(); set.createMinimalSet(/*ignoreTransparent*/false); @@ -145,8 +176,9 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co QuickCompress::outputBlock4(set, start, end, block); } } +#endif } -#elif 1 +#elif 0 extern void compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output); diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp index f884a9a..62653f6 100644 --- a/src/nvtt/CompressorDXT1.cpp +++ b/src/nvtt/CompressorDXT1.cpp @@ -20,13 +20,81 @@ using namespace nv; -inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16]) -{ - for (int i = 0; i < 16; i++) - { - const Color32 c = rgba.color(i); - block[i] = Vector3(c.r, c.g, c.b); - } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Color conversion functions. + +static const float midpoints5[32] = { + 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, + 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f +}; + +static const float midpoints6[64] = { + 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, + 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, + 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, + 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f +}; + +/*void init_tables() { + for (int i = 0; i < 31; i++) { + float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; + float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; + midpoints5[i] = (f0 + f1) * 0.5; + } + midpoints5[31] = 1.0f; + + for (int i = 0; i < 63; i++) { + float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; + float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; + midpoints6[i] = (f0 + f1) * 0.5; + } + midpoints6[63] = 1.0f; +}*/ + +static Color16 vector3_to_color16(const Vector3 & v) { + // Truncate. + uint r = ftoi_trunc(clamp(v.x * 31.0f, 0.0f, 31.0f)); + uint g = ftoi_trunc(clamp(v.y * 63.0f, 0.0f, 63.0f)); + uint b = ftoi_trunc(clamp(v.z * 31.0f, 0.0f, 31.0f)); + + // Round exactly according to 565 bit-expansion. + r += (v.x > midpoints5[r]); + g += (v.y > midpoints6[g]); + b += (v.z > midpoints5[b]); + + return Color16((r << 11) | (g << 5) | b); +} + + +static Color32 bitexpand_color16_to_color32(Color16 c16) { + Color32 c32; + //c32.b = (c16.b << 3) | (c16.b >> 2); + //c32.g = (c16.g << 2) | (c16.g >> 4); + //c32.r = (c16.r << 3) | (c16.r >> 2); + //c32.a = 0xFF; + + c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); + c32.u |= (c32.u >> 5) & 0x070007; + c32.u |= (c32.u >> 6) & 0x000300; + + return c32; +} + +static Color32 bitexpand_color16_to_color32(int r, int g, int b) { + Color32 c32; + c32.b = (b << 3) | (b >> 2); + c32.g = (g << 2) | (g >> 4); + c32.r = (r << 3) | (r >> 2); + c32.a = 0xFF; + return c32; +} + +static Color16 truncate_color32_to_color16(Color32 c32) { + Color16 c16; + c16.b = (c32.b >> 3); + c16.g = (c32.g >> 2); + c16.r = (c32.r >> 3); + return c16; } inline Vector3 r5g6b5_to_vector3(int r, int g, int b) @@ -40,20 +108,32 @@ inline Vector3 r5g6b5_to_vector3(int r, int g, int b) inline Vector3 color_to_vector3(Color32 c) { - const float scale = 1.0f / 255.0f; + const float scale = 1.0f / 255.0f; return Vector3(c.r * scale, c.g * scale, c.b * scale); } inline Color32 vector3_to_color(Vector3 v) { - Color32 color; - color.r = U8(ftoi_round(saturate(v.x) * 255)); - color.g = U8(ftoi_round(saturate(v.y) * 255)); - color.b = U8(ftoi_round(saturate(v.z) * 255)); - color.a = 255; + Color32 color; + color.r = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.a = 255; + return color; } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Input block processing. + +inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16]) +{ + for (int i = 0; i < 16; i++) + { + const Color32 c = rgba.color(i); + block[i] = Vector3(c.r, c.g, c.b); + } +} // Find first valid color. static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color) @@ -114,46 +194,67 @@ static int reduce_colors(const Vector3 * input_colors, const float * input_weigh } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Error evaluation. // Different ways of estimating the error. -static float evaluate_mse(const Vector3 & p, const Vector3 & c) { - return square(p.x-c.x) + square(p.y-c.y) + square(p.z-c.z); +/*static float evaluate_mse(const Vector3 & p, const Vector3 & c) { + //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z); + Vector3 d = (p - c); + return dot(d, d); +}*/ + +static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z); + Vector3 d = (p - c) * w; + return dot(d, d); } /*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); }*/ -static int evaluate_mse_rgb(const Color32 & p, const Color32 & c) { - return square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b); +static int evaluate_mse(const Color32 & p, const Color32 & c) { + return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); } -static float evaluate_mse(const Vector3 palette[4], const Vector3 & c) { - float e0 = evaluate_mse(palette[0], c); - float e1 = evaluate_mse(palette[1], c); - float e2 = evaluate_mse(palette[2], c); - float e3 = evaluate_mse(palette[3], c); +static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) { + float e0 = evaluate_mse(palette[0], c, w); + float e1 = evaluate_mse(palette[1], c, w); + float e2 = evaluate_mse(palette[2], c, w); + float e3 = evaluate_mse(palette[3], c, w); return min(min(e0, e1), min(e2, e3)); } static int evaluate_mse(const Color32 palette[4], const Color32 & c) { - int e0 = evaluate_mse_rgb(palette[0], c); - int e1 = evaluate_mse_rgb(palette[1], c); - int e2 = evaluate_mse_rgb(palette[2], c); - int e3 = evaluate_mse_rgb(palette[3], c); + int e0 = evaluate_mse(palette[0], c); + int e1 = evaluate_mse(palette[1], c); + int e2 = evaluate_mse(palette[2], c); + int e3 = evaluate_mse(palette[3], c); return min(min(e0, e1), min(e2, e3)); } -static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, int index) { - return evaluate_mse(palette[index], c); -} +// Returns MSE error in [0-255] range. +static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); -static int evaluate_mse(const Color32 palette[4], const Color32 & c, int index) { - return evaluate_mse_rgb(palette[index], c); + return evaluate_mse(palette[index], color); } +// Returns weighted MSE error in [0-255] range. +static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { + + float total = 0.0f; + for (int i = 0; i < count; i++) { + total += weights[i] * evaluate_mse(palette, colors[i]); + } + + return total; +} -static float evaluate_mse(const BlockDXT1 * output, Vector3 colors[16]) { +#if 0 +static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) { Color32 palette[4]; output->evaluatePalette(palette, /*d3d9=*/false); @@ -167,39 +268,171 @@ static float evaluate_mse(const BlockDXT1 * output, Vector3 colors[16]) { float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (output->indices >> (2*i)) & 3; // @@ Is this the right order? - error += evaluate_mse(vector_palette, colors[i], index); + error += evaluate_mse(vector_palette[index], colors[i]); } return error; } +#endif -static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { +static float evaluate_mse(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { Color32 palette[4]; output->evaluatePalette(palette, /*d3d9=*/false); - return evaluate_mse(palette, color, index); + // convert palette to float. + Vector3 vector_palette[4]; + for (int i = 0; i < 4; i++) { + vector_palette[i] = color_to_vector3(palette[i]); + } + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (output->indices >> (2 * i)) & 3; + error += weights[i] * evaluate_mse(vector_palette[index], colors[i], color_weights); + } + return error; } -/*void output_block3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block) + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Palette evaluation. + +static void evaluate_palette4(Color32 palette[4]) { + palette[2].r = (2 * palette[0].r + palette[1].r) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b) / 3; + palette[3].r = (2 * palette[1].r + palette[0].r) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b) / 3; +} + +static void evaluate_palette3(Color32 palette[4]) { + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; + palette[3].r = 0; + palette[3].g = 0; + palette[3].b = 0; +} + +static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + if (c0.u > c1.u) { + evaluate_palette4(palette); + } + else { + evaluate_palette3(palette); + } +} + +static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + +static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) { + nvDebugCheck(c0.u > c1.u); + + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + + + + + +static uint compute_indices4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { + + uint indices = 0; + for (int i = 0; i < 16; i++) { + float d0 = evaluate_mse(palette[0], input_colors[i], color_weights); + float d1 = evaluate_mse(palette[1], input_colors[i], color_weights); + float d2 = evaluate_mse(palette[2], input_colors[i], color_weights); + float d3 = evaluate_mse(palette[3], input_colors[i], color_weights); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + } + + return indices; +} + + +static uint compute_indices(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { + + uint indices = 0; + for (int i = 0; i < 16; i++) { + float d0 = evaluate_mse(palette[0], input_colors[i], color_weights); + float d1 = evaluate_mse(palette[1], input_colors[i], color_weights); + float d2 = evaluate_mse(palette[2], input_colors[i], color_weights); + float d3 = evaluate_mse(palette[3], input_colors[i], color_weights); + + uint index; + if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; + else if (d1 < d2 && d1 < d3) index = 1; + else if (d2 < d3) index = 2; + else index = 3; + + indices |= index << (2 * i); + } + + return indices; +} + + +static void output_block3(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) { - Vector3 minColor = start * 255.0f; - Vector3 maxColor = end * 255.0f; - uint16 color0 = roundAndExpand(&minColor); - uint16 color1 = roundAndExpand(&maxColor); + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); - if (color0 > color1) { - swap(maxColor, minColor); + if (color0.u > color1.u) { swap(color0, color1); } - block->col0 = Color16(color0); - block->col1 = Color16(color1); - block->indices = compute_indices3(colors, weights, count, maxColor / 255.0f, minColor / 255.0f); + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); - //optimizeEndPoints3(set, block); -}*/ + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices(input_colors, color_weights, palette); +} + +static void output_block4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u < color1.u) { + swap(color0, color1); + } + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices4(input_colors, color_weights, palette); +} @@ -207,7 +440,7 @@ static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { // Single color compressor, based on: // https://mollyrocket.com/forums/viewtopic.php?t=392 -float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { output->col0.r = OMatch5[c.r][0]; output->col0.g = OMatch6[c.g][0]; @@ -222,92 +455,66 @@ float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) swap(output->col0.u, output->col1.u); output->indices ^= 0x55555555; } - - return (float) evaluate_mse(output, c, output->indices & 3); } -float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) -{ - return compress_dxt1_single_color_optimal(vector3_to_color(color), output); -} - - -// Low quality baseline compressor. -float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { - // @@ Iterative best end point fit. + ::compress_dxt1_single_color_optimal(c, output); - return FLT_MAX; + // Multiply by 16^2, the weight associated to a single color. + // Divide by 255*255 to covert error to [0-1] range. + return (256.0f / (255*255)) * evaluate_mse(output, c, output->indices & 3); } -static Color32 bitexpand_color16_to_color32(Color16 c16) { - Color32 c32; - c32.b = (c16.b << 3) | (c16.b >> 2); - c32.g = (c16.g << 2) | (c16.g >> 4); - c32.r = (c16.r << 3) | (c16.r >> 2); - c32.a = 0xFF; - - //c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); - //c32.u |= (c32.u >> 5) & 0x070007; - //c32.u |= (c32.u >> 6) & 0x000300; - - return c32; +float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) +{ + return compress_dxt1_single_color_optimal(vector3_to_color(color), output); } -static Color32 bitexpand_color16_to_color32(int r, int g, int b) { - Color32 c32; - c32.b = (b << 3) | (b >> 2); - c32.g = (g << 2) | (g >> 4); - c32.r = (r << 3) | (r >> 2); - c32.a = 0xFF; - return c32; -} -static Color16 truncate_color32_to_color16(Color32 c32) { - Color16 c16; - c16.b = (c32.b >> 3); - c16.g = (c32.g >> 2); - c16.r = (c32.r >> 3); - return c16; -} +// Compress block using the average color. +float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) +{ + // Compute block average. + Vector3 color_sum(0); + float weight_sum = 0; + for (int i = 0; i < count; i++) { + color_sum += colors[i] * weights[i]; + weight_sum += weights[i]; + } + // Compress optimally. + ::compress_dxt1_single_color_optimal(vector3_to_color(color_sum / weight_sum), output); + // Decompress block color. + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); -static float evaluate_palette4(Color32 palette[4]) { - palette[2].r = (2 * palette[0].r + palette[1].r) / 3; - palette[2].g = (2 * palette[0].g + palette[1].g) / 3; - palette[2].b = (2 * palette[0].b + palette[1].b) / 3; - palette[3].r = (2 * palette[1].r + palette[0].r) / 3; - palette[3].g = (2 * palette[1].g + palette[0].g) / 3; - palette[3].b = (2 * palette[1].b + palette[0].b) / 3; -} + Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]); -static float evaluate_palette3(Color32 palette[4]) { - palette[2].r = (palette[0].r + palette[1].r) / 2; - palette[2].g = (palette[0].g + palette[1].g) / 2; - palette[2].b = (palette[0].b + palette[1].b) / 2; - palette[3].r = 0; - palette[3].g = 0; - palette[3].b = 0; + // Evaluate error. + float error = 0; + for (int i = 0; i < count; i++) { + error += weights[i] * evaluate_mse(block_color, colors[i], color_weights); + } + return error; } -static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { - - float total = 0.0f; - for (int i = 0; i < count; i++) { - total += (weights[i] * weights[i]) * evaluate_mse(palette, colors[i]); - } - - return total; -} +/* @@ Not implemented yet. +// Low quality baseline compressor. +float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +{ + // @@ Iterative best end point fit. + return FLT_MAX; +}*/ -float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int max_volume, BlockDXT1 * output) +float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int max_volume, BlockDXT1 * output) { // Compute bounding box. Vector3 min_color(1.0f); @@ -331,85 +538,92 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], int range_g = max_g - min_g; int range_b = max_b - min_b; - min_r = max(0, min_r - (range_r + 1) / 1 - 1); - min_g = max(0, min_g - (range_g + 1) / 1 - 1); - min_b = max(0, min_b - (range_b + 1) / 1 - 1); + min_r = max(0, min_r - range_r / 2 - 2); + min_g = max(0, min_g - range_g / 2 - 2); + min_b = max(0, min_b - range_b / 2 - 2); - max_r = min(31, max_r + (range_r + 1) / 2 + 1); - max_g = min(63, max_g + (range_g + 1) / 2 + 1); - max_b = min(31, max_b + (range_b + 1) / 2 + 1); + max_r = min(31, max_r + range_r / 2 + 2); + max_g = min(63, max_g + range_g / 2 + 2); + max_b = min(31, max_b + range_b / 2 + 2); // Estimate size of search space. int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1); - // if size under search_limit, then proceed. Note that search_limit is sqrt of number of evaluations. + // if size under search_limit, then proceed. Note that search_volume is sqrt of number of evaluations. if (volume > max_volume) { return FLT_MAX; } + // @@ Convert to fixed point before building box? Color32 colors32[16]; for (int i = 0; i < count; i++) { colors32[i] = toColor32(Vector4(colors[i], 1)); } float best_error = FLT_MAX; - Color32 best0, best1; + Color16 best0, best1; // @@ Record endpoints as Color16? + + Color16 c0, c1; + Color32 palette[4]; for(int r0 = min_r; r0 <= max_r; r0++) - for(int r1 = max_r; r1 >= r0; r1--) for(int g0 = min_g; g0 <= max_g; g0++) - for(int g1 = max_g; g1 >= g0; g1--) for(int b0 = min_b; b0 <= max_b; b0++) - for(int b1 = max_b; b1 >= b0; b1--) { - Color32 palette[4]; - palette[0] = bitexpand_color16_to_color32(r1, g1, b1); - palette[1] = bitexpand_color16_to_color32(r0, g0, b0); - - // Evaluate error in 4 color mode. - evaluate_palette4(palette); + c0.r = r0; c0.g = g0; c0.b = b0; + palette[0] = bitexpand_color16_to_color32(c0); + + for(int r1 = min_r; r1 <= max_r; r1++) + for(int g1 = min_g; g1 <= max_g; g1++) + for(int b1 = min_b; b1 <= max_b; b1++) + { + c1.r = r1; c1.g = g1; c1.b = b1; + palette[1] = bitexpand_color16_to_color32(c1); + + if (c0.u > c1.u) { + // Evaluate error in 4 color mode. + evaluate_palette4(palette); + } + else { + #if 1 + // Evaluate error in 3 color mode. + evaluate_palette3(palette); + #else + // Skip 3 color mode. + continue; + #endif + } - float error = evaluate_palette_error(palette, colors32, weights, count); + float error = evaluate_palette_error(palette, colors32, weights, count); - if (error < best_error) { - best_error = error; - best0 = palette[0]; - best1 = palette[1]; + if (error < best_error) { + best_error = error; + best0 = c0; + best1 = c1; + } } + } -#if 0 - // Evaluate error in 3 color mode. - evaluate_palette3(palette); - - float error = evaluate_palette_error(palette, colors, weights, count); + output->col0 = best0; + output->col1 = best1; - if (error < best_error) { - best_error = error; - best0 = palette[1]; - best1 = palette[0]; - } -#endif + if (output->col0.u < output->col1.u) { + int k = 1; } - output->col0 = truncate_color32_to_color16(best0); - output->col1 = truncate_color32_to_color16(best1); + Vector3 vector_palette[4]; + evaluate_palette(output->col0, output->col1, vector_palette); - if (output->col0.u <= output->col1.u) { - //output->indices = computeIndices3(colors, best0, best1); - } - else { - //output->indices = computeIndices4(colors, best0, best1); - } + output->indices = compute_indices(input_colors, color_weights, vector_palette); - return FLT_MAX; + return best_error / (255 * 255); } -float nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +void nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) { ClusterFit fit; - //fit.setColorWeights(compressionOptions.colorWeight); - fit.setColorWeights(Vector4(1)); // @@ Set color weights. + fit.setColorWeights(Vector4(color_weights, 1)); fit.setColorSet(colors, weights, count); // start & end are in [0, 1] range. @@ -417,18 +631,17 @@ float nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector fit.compress4(&start, &end); if (fit.compress3(&start, &end)) { - //output_block3(input_colors, start, end, block); - // @@ Output block. + output_block3(input_colors, color_weights, start, end, output); } else { - //output_block4(input_colors, start, end, block); - // @@ Output block. + output_block4(input_colors, color_weights, start, end, output); } } -float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], BlockDXT1 * output) + +float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output) { Vector3 colors[16]; float weights[16]; @@ -442,20 +655,59 @@ float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weight return 0; } - if (count == 1) { - return compress_dxt1_single_color_optimal(colors[0], output); + + float error = FLT_MAX; + + // Sometimes the single color compressor produces better results than the exhaustive. This introduces discontinuities between blocks that + // use different compressors. For this reason, this is not enabled by default. + if (1) { + error = compress_dxt1_single_color(colors, weights, count, color_weights, output); + + if (error == 0.0f || count == 1) { + // Early out. + return error; + } } + // This is too expensive, even with a low threshold. // If high quality: - //error = compress_dxt1_bounding_box_exhaustive(colors, weigths, count, 3200, error, output); - //if (error < FLT_MAX) return error; + if (0) { + BlockDXT1 exhaustive_output; + float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, 400, &exhaustive_output); + + if (exhaustive_error != FLT_MAX) { + float exhaustive_error2 = evaluate_mse(input_colors, input_weights, color_weights, &exhaustive_output); + + // The exhaustive compressor does not use color_weights, so the results may be different. + //nvCheck(equal(exhaustive_error, exhaustive_error2)); + + if (exhaustive_error2 < error) { + *output = exhaustive_output; + error = exhaustive_error; + } + } + } + // @@ TODO. // This is pretty fast and in some cases can produces better quality than cluster fit. -// error = compress_dxt1_least_squares_fit(colors, weigths, error, output); + //error = compress_dxt1_least_squares_fit(colors, weigths, error, output); - // - float error = compress_dxt1_cluster_fit(input_colors, colors, weights, count, output); + // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already. + if (error == FLT_MAX && count == 1) { + error = compress_dxt1_single_color_optimal(colors[0], output); + } + + if (count > 1) { + BlockDXT1 cluster_fit_output; + compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, &cluster_fit_output); + + float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output); + + if (cluster_fit_error < error) { + *output = cluster_fit_output; + error = cluster_fit_error; + } + } return error; } - diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h index 3c2fe57..fb9d655 100644 --- a/src/nvtt/CompressorDXT1.h +++ b/src/nvtt/CompressorDXT1.h @@ -28,11 +28,12 @@ namespace nv { float compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output); float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output); - float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output); - float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int search_limit, BlockDXT1 * output); - float compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output); + float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); + float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); + float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int search_limit, BlockDXT1 * output); + void compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); - float compress_dxt1(const Vector3 colors[16], const float weights[16], BlockDXT1 * output); + float compress_dxt1(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, BlockDXT1 * output); } \ No newline at end of file