From daff42781de2a0e014d2827a320b330914a7e267 Mon Sep 17 00:00:00 2001 From: Ignacio Date: Sun, 5 Apr 2020 12:22:25 -0700 Subject: [PATCH] Work toward packaging dxt1 compressor as a single header library. --- src/nvmath/PackedFloat.h | 16 +- src/nvtt/ClusterFit.cpp | 94 ++++++- src/nvtt/ClusterFit.h | 162 ++++++------ src/nvtt/CompressorDXT1.cpp | 496 ++++++++++++++++++++++++------------ src/nvtt/CompressorDXT1.h | 6 +- 5 files changed, 521 insertions(+), 253 deletions(-) diff --git a/src/nvmath/PackedFloat.h b/src/nvmath/PackedFloat.h index bf84b85..dc3ed99 100755 --- a/src/nvmath/PackedFloat.h +++ b/src/nvmath/PackedFloat.h @@ -62,17 +62,17 @@ namespace nv }; }; - NVMATH_API Vector3 rgb9e5_to_vector3(FloatRGB9E5 v); - NVMATH_API FloatRGB9E5 vector3_to_rgb9e5(const Vector3 & v); + Vector3 rgb9e5_to_vector3(FloatRGB9E5 v); + FloatRGB9E5 vector3_to_rgb9e5(const Vector3 & v); - NVMATH_API float float11_to_float32(uint v); - NVMATH_API float float10_to_float32(uint v); + float float11_to_float32(uint v); + float float10_to_float32(uint v); - NVMATH_API Vector3 r11g11b10_to_vector3(FloatR11G11B10 v); - NVMATH_API FloatR11G11B10 vector3_to_r11g11b10(const Vector3 & v); + Vector3 r11g11b10_to_vector3(FloatR11G11B10 v); + FloatR11G11B10 vector3_to_r11g11b10(const Vector3 & v); - NVMATH_API Vector3 rgbe8_to_vector3(FloatRGBE8 v); - NVMATH_API FloatRGBE8 vector3_to_rgbe8(const Vector3 & v); + Vector3 rgbe8_to_vector3(FloatRGBE8 v); + FloatRGBE8 vector3_to_rgbe8(const Vector3 & v); } // nv diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp index 54652e9..4e8728e 100644 --- a/src/nvtt/ClusterFit.cpp +++ b/src/nvtt/ClusterFit.cpp @@ -1,7 +1,6 @@ // MIT license see full LICENSE text at end of file #include "ClusterFit.h" -#include "nvmath/Fitting.h" #include "nvmath/Vector.inl" #include // FLT_MAX @@ -9,6 +8,96 @@ using namespace nv; +static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + Vector3 centroid(0.0f); + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i] * points[i]; + } + centroid *= (1.0f / total); + + return centroid; +} + +static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points, weights, metric); + + // compute covariance matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 a = (points[i] - centroid) * metric; // @@ I think weight should be squared, but that seems to increase the error slightly. + Vector3 b = weights[i] * a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.y * b.y; + covariance[4] += a.y * b.z; + covariance[5] += a.z * b.z; + } + + return centroid; +} + +// @@ We should be able to do something cheaper... +static Vector3 estimatePrincipalComponent(const float * __restrict matrix) +{ + const Vector3 row0(matrix[0], matrix[1], matrix[2]); + const Vector3 row1(matrix[1], matrix[3], matrix[4]); + const Vector3 row2(matrix[2], matrix[4], matrix[5]); + + float r0 = lengthSquared(row0); + float r1 = lengthSquared(row1); + float r2 = lengthSquared(row2); + + if (r0 > r1 && r0 > r2) return row0; + if (r1 > r2) return row1; + return row2; +} + +static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return Vector3(0.0f); + } + + Vector3 v = estimatePrincipalComponent(matrix); + + const int NUM = 8; + for (int i = 0; i < NUM; i++) + { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + + float norm = max(max(x, y), z); + + v = Vector3(x, y, z) * (1.0f / norm); + } + + return v; +} + +static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + float matrix[6]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count) { // initialise the best error @@ -23,8 +112,7 @@ void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int m_count = count; // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. - Vector3 principal = Fit::computePrincipalComponent_PowerMethod(count, colors, weights, metric); - //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(count, colors, weights, metric); + Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights, metric); // build the list of values int order[16]; diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h index 3597143..9a15f33 100644 --- a/src/nvtt/ClusterFit.h +++ b/src/nvtt/ClusterFit.h @@ -1,76 +1,86 @@ -// MIT license see full LICENSE text at end of file -#pragma once - -#include "nvmath/SimdVector.h" -#include "nvmath/Vector.h" -#include "nvcore/Memory.h" - -// Use SIMD version if altivec or SSE are available. -#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) -//#define NVTT_USE_SIMD 0 - -namespace nv { - - struct ColorSet; - - class ClusterFit - { - public: - ClusterFit() {} - - void setColorSet(const Vector3 * colors, const float * weights, int count); - - void setColorWeights(const Vector4 & w); - float bestError() const; - - bool compress3(Vector3 * start, Vector3 * end); - bool compress4(Vector3 * start, Vector3 * end); - - private: - - uint m_count; - - // IC: Color and weight arrays are larger than necessary to avoid compiler warning. - - #if NVTT_USE_SIMD - NV_ALIGN_16 SimdVector m_weighted[17]; // color | weight - SimdVector m_metric; // vec3 - SimdVector m_metricSqr; // vec3 - SimdVector m_xxsum; // color | weight - SimdVector m_xsum; // color | weight (wsum) - SimdVector m_besterror; // scalar - #else - Vector3 m_weighted[17]; - float m_weights[17]; - Vector3 m_metric; - Vector3 m_metricSqr; - Vector3 m_xxsum; - Vector3 m_xsum; - float m_wsum; - float m_besterror; - #endif - }; - -} // nv namespace - -// Copyright (c) 2006-2020 Ignacio Castano icastano@nvidia.com -// Copyright (c) 2006 Simon Brown si@sjbrown.co.uk -// -// Permission is hereby granted, free of charge, to any person obtaining -// a copy of this software and associated documentation files (the -// "Software"), to deal in the Software without restriction, including -// without limitation the rights to use, copy, modify, merge, publish, -// distribute, sublicense, and/or sell copies of the Software, and to -// permit persons to whom the Software is furnished to do so, subject to -// the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// MIT license see full LICENSE text at end of file +#pragma once + +#include "nvmath/SimdVector.h" +#include "nvmath/Vector.h" + +// Use SIMD version if altivec or SSE are available. +#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) +//#define NVTT_USE_SIMD 0 + +#include +#if (NV_USE_SSE > 1) +#include +#endif + +#ifndef NV_ALIGN_16 +#if NV_CC_GNUC +# define NV_ALIGN_16 __attribute__ ((__aligned__ (16))) +#else +# define NV_ALIGN_16 __declspec(align(16)) +#endif +#endif + +namespace nv { + + class ClusterFit + { + public: + ClusterFit() {} + + void setColorSet(const Vector3 * colors, const float * weights, int count); + + void setColorWeights(const Vector4 & w); + float bestError() const; + + bool compress3(Vector3 * start, Vector3 * end); + bool compress4(Vector3 * start, Vector3 * end); + + private: + + uint m_count; + + // IC: Color and weight arrays are larger than necessary to avoid compiler warning. + + #if NVTT_USE_SIMD + NV_ALIGN_16 SimdVector m_weighted[17]; // color | weight + SimdVector m_metric; // vec3 + SimdVector m_metricSqr; // vec3 + SimdVector m_xxsum; // color | weight + SimdVector m_xsum; // color | weight (wsum) + SimdVector m_besterror; // scalar + #else + Vector3 m_weighted[17]; + float m_weights[17]; + Vector3 m_metric; + Vector3 m_metricSqr; + Vector3 m_xxsum; + Vector3 m_xsum; + float m_wsum; + float m_besterror; + #endif + }; + +} // nv namespace + +// Copyright (c) 2006-2020 Ignacio Castano icastano@nvidia.com +// Copyright (c) 2006 Simon Brown si@sjbrown.co.uk +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp index f68ab3c..1faad8c 100644 --- a/src/nvtt/CompressorDXT1.cpp +++ b/src/nvtt/CompressorDXT1.cpp @@ -1,17 +1,8 @@ #include "CompressorDXT1.h" -#include "SingleColorLookup.h" #include "ClusterFit.h" -#include "nvimage/ColorBlock.h" -#include "nvimage/BlockDXT.h" - -#include "nvmath/Color.inl" -#include "nvmath/Vector.inl" -#include "nvmath/Fitting.h" -#include "nvmath/ftoi.h" - -#include "nvcore/Utils.h" // swap +#include "nvmath/nvmath.h" #include // memset #include // FLT_MAX @@ -19,6 +10,104 @@ using namespace nv; +/// Swap two values. +/*template +inline void swap(T & a, T & b) +{ + T temp(a); + a = b; + b = temp; +}*/ + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Basic Types + +struct Color16 { + union { + struct { + uint16 b : 5; + uint16 g : 6; + uint16 r : 5; + }; + uint16 u; + }; +}; + +struct Color32 { + union { + struct { + uint8 b, g, r, a; + }; + uint32 u; + }; +}; + +namespace nv { + struct BlockDXT1 { + Color16 col0; + Color16 col1; + uint32 indices; + }; + + + /*struct Vector3 { + float x, y, z; + };*/ + + inline Vector3 operator*(Vector3 v, float s) { + return { v.x * s, v.y * s, v.z * s }; + } + + inline Vector3 operator*(float s, Vector3 v) { + return { v.x * s, v.y * s, v.z * s }; + } + + inline Vector3 operator*(Vector3 a, Vector3 b) { + return { a.x * b.x, a.y * b.y, a.z * b.z }; + } + + inline float dot(Vector3 a, Vector3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; + } + + inline Vector3 operator+(Vector3 a, Vector3 b) { + return { a.x + b.x, a.y + b.y, a.z + b.z }; + } + + inline Vector3 operator-(Vector3 a, Vector3 b) { + return { a.x - b.x, a.y - b.y, a.z - b.z }; + } + + inline Vector3 operator/(Vector3 v, float s) { + return { v.x / s, v.y / s, v.z / s }; + } + + /*inline float saturate(float x) { + return x < 0 ? 0 : (x > 1 ? 1 : x); + }*/ + + inline Vector3 saturate(Vector3 v) { + return { saturate(v.x), saturate(v.y), saturate(v.z) }; + } + + inline Vector3 min(Vector3 a, Vector3 b) { + return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) }; + } + + inline Vector3 max(Vector3 a, Vector3 b) { + return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) }; + } + + inline bool operator==(const Vector3 & a, const Vector3 & b) { + return memcmp(&a, &b, sizeof(Vector3)); + } + + inline void Vector3::set(float x, float y, float z) { + this->x = x; this->y = y; this->z = z; + } + +} /////////////////////////////////////////////////////////////////////////////////////////////////// // Color conversion functions. @@ -54,16 +143,18 @@ static const float midpoints6[64] = { static Color16 vector3_to_color16(const Vector3 & v) { // Truncate. - uint r = ftoi_trunc(clamp(v.x * 31.0f, 0.0f, 31.0f)); - uint g = ftoi_trunc(clamp(v.y * 63.0f, 0.0f, 63.0f)); - uint b = ftoi_trunc(clamp(v.z * 31.0f, 0.0f, 31.0f)); + uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f)); + uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f)); + uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f)); // Round exactly according to 565 bit-expansion. r += (v.x > midpoints5[r]); g += (v.y > midpoints6[g]); b += (v.z > midpoints5[b]); - return Color16((r << 11) | (g << 5) | b); + Color16 c; + c.u = (r << 11) | (g << 5) | b; + return c; } @@ -87,12 +178,12 @@ inline Vector3 color_to_vector3(Color32 c) return Vector3(c.r / 255.0f, c.g / 255.0f, c.b / 255.0f); } -inline Color32 vector3_to_color(Vector3 v) +inline Color32 vector3_to_color32(Vector3 v) { Color32 color; - color.r = U8(ftoi_round(saturate(v.x) * 255)); - color.g = U8(ftoi_round(saturate(v.y) * 255)); - color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.r = uint8(saturate(v.x) * 255 + 0.5f); + color.g = uint8(saturate(v.y) * 255 + 0.5f); + color.b = uint8(saturate(v.z) * 255 + 0.5f); color.a = 255; return color; } @@ -101,15 +192,6 @@ inline Color32 vector3_to_color(Vector3 v) /////////////////////////////////////////////////////////////////////////////////////////////////// // Input block processing. -/*inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16]) -{ - for (int i = 0; i < 16; i++) - { - const Color32 c = rgba.color(i); - block[i] = Vector3(c.r, c.g, c.b); - } -}*/ - // Find first valid color. /*static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color) { @@ -201,6 +283,107 @@ static int reduce_colors(const uint8 * input_colors, Vector3 * colors, float * w } +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Palette evaluation. + +#define DECODER 0 + +inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4], bool d3d9_bias) { +#if DECODER == 0 || DECODER == 1 + palette[2].r = (2 * palette[0].r + palette[1].r + d3d9_bias) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g + d3d9_bias) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b + d3d9_bias) / 3; + palette[3].r = (2 * palette[1].r + palette[0].r + d3d9_bias) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g + d3d9_bias) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b + d3d9_bias) / 3; +#else + int dg = palette[1].g - palette[0].g; + palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; + palette[2].g = (256 * palette[0].g + dg * 80 + dg / 4 + 128) / 256; + palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; + palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; + palette[3].g = (256 * palette[1].g - dg * 80 - dg / 4 + 128) / 256; + palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; +#endif +} + +inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) { +#if DECODER == 0 || DECODER == 1 + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; +#else + int dg = palette[1].g - palette[0].g; + palette[2].r = ((c0.r + c1.r) * 33) / 8; + palette[2].g = (256 * palette[0].g + dg * 128 + dg / 4 + 128) / 256; + palette[2].b = ((c0.b + c1.b) * 33) / 8; +#endif + palette[3].r = 0; + palette[3].g = 0; + palette[3].b = 0; +} + +static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], bool d3d9_bias) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + if (c0.u > c1.u) { + evaluate_palette4(c0, c1, palette, d3d9_bias); + } + else { + evaluate_palette3(c0, c1, palette); + } +} + +static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0].r = (3 * c0.r * 22) / 8; + palette[0].g = (c0.g << 2) | (c0.g >> 4); + palette[0].b = (3 * c0.b * 22) / 8; + palette[1].a = 255; + palette[1].r = (3 * c1.r * 22) / 8; + palette[1].g = (c1.g << 2) | (c1.g >> 4); + palette[1].b = (3 * c1.b * 22) / 8; + palette[1].a = 255; + + int gdiff = palette[1].g - palette[0].g; + if (c0.u > c1.u) { + palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; + palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256; + palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; + palette[2].a = 0xFF; + + palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; + palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256; + palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; + palette[3].a = 0xFF; + } + else { + palette[2].r = ((c0.r + c1.r) * 33) / 8; + palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256; + palette[2].b = ((c0.b + c1.b) * 33) / 8; + palette[2].a = 0xFF; + palette[3].u = 0; + } +} + +static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { +#if DECODER == 0 + evaluate_palette(c0, c1, palette, false); +#elif DECODER == 1 + evaluate_palette(c0, c1, palette, true); +#elif DECODER == 2 + evaluate_palette_nv(c0, c1, palette); +#endif +} + +static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + /////////////////////////////////////////////////////////////////////////////////////////////////// // Error evaluation. @@ -245,8 +428,8 @@ static int evaluate_mse(const Color32 palette[4], const Color32 & c) { // Returns MSE error in [0-255] range. static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { Color32 palette[4]; - //output->evaluatePalette(palette, /*d3d9=*/false); - output->evaluatePaletteNV5x(palette); + evaluate_palette(output->col0, output->col1, palette); + //evaluate_palette_nv(output->col0, output->col1, palette); return evaluate_mse(palette[index], color); } @@ -296,8 +479,8 @@ static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) { static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { Color32 palette[4]; - output->evaluatePalette(palette, /*d3d9=*/false); - //output->evaluatePaletteNV5x(palette); + evaluate_palette(output->col0, output->col1, palette); + //evaluate_palette_nv5x(output->col0, output->col1, palette); // convert palette to float. /*Vector3 vector_palette[4]; @@ -317,105 +500,30 @@ static float evaluate_mse(const Vector4 input_colors[16], const float input_weig float nv::evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, int decoder) { Color32 palette[4]; if (decoder == 2) { - block->evaluatePaletteNV5x(palette); + evaluate_palette_nv(block->col0, block->col1, palette); + } else { - block->evaluatePalette(palette, /*d3d9=*/decoder); + evaluate_palette(block->col0, block->col1, palette, /*d3d9=*/decoder); } // evaluate error for each index. float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; - Color32 c(rgba_block[4 * i + 0], rgba_block[4 * i + 1], rgba_block[4 * i + 2]); + Color32 c; + c.r = rgba_block[4 * i + 0]; + c.g = rgba_block[4 * i + 1]; + c.b = rgba_block[4 * i + 2]; + c.a = 255; error += evaluate_mse(palette[index], c); } return error; } - /////////////////////////////////////////////////////////////////////////////////////////////////// -// Palette evaluation. - -#define DECODER 0 - -inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) { -#if DECODER == 0 - palette[2].r = (2 * palette[0].r + palette[1].r) / 3; - palette[2].g = (2 * palette[0].g + palette[1].g) / 3; - palette[2].b = (2 * palette[0].b + palette[1].b) / 3; - palette[3].r = (2 * palette[1].r + palette[0].r) / 3; - palette[3].g = (2 * palette[1].g + palette[0].g) / 3; - palette[3].b = (2 * palette[1].b + palette[0].b) / 3; -#elif DECODER == 1 - palette[2].r = (2 * palette[0].r + palette[1].r + 1) / 3; - palette[2].g = (2 * palette[0].g + palette[1].g + 1) / 3; - palette[2].b = (2 * palette[0].b + palette[1].b + 1) / 3; - palette[3].r = (2 * palette[1].r + palette[0].r + 1) / 3; - palette[3].g = (2 * palette[1].g + palette[0].g + 1) / 3; - palette[3].b = (2 * palette[1].b + palette[0].b + 1) / 3; -#else - int dg = palette[1].g - palette[0].g; - palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; - palette[2].g = (256 * palette[0].g + dg * 80 + dg / 4 + 128) / 256; - palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; - palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; - palette[3].g = (256 * palette[1].g - dg * 80 - dg / 4 + 128) / 256; - palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; -#endif -} - -inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) { -#if DECODER == 0 || DECODER == 1 - palette[2].r = (palette[0].r + palette[1].r) / 2; - palette[2].g = (palette[0].g + palette[1].g) / 2; - palette[2].b = (palette[0].b + palette[1].b) / 2; -#else - int dg = palette[1].g - palette[0].g; - palette[2].r = ((c0.r + c1.r) * 33) / 8; - palette[2].g = (256 * palette[0].g + dg * 128 + dg / 4 + 128) / 256; - palette[2].b = ((c0.b + c1.b) * 33) / 8; -#endif - palette[3].r = 0; - palette[3].g = 0; - palette[3].b = 0; -} - -static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[0] = bitexpand_color16_to_color32(c0); - palette[1] = bitexpand_color16_to_color32(c1); - if (c0.u > c1.u) { - evaluate_palette4(c0, c1, palette); - } - else { - evaluate_palette3(c0, c1, palette); - } -} - -static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { - Color32 palette32[4]; - evaluate_palette(c0, c1, palette32); - - for (int i = 0; i < 4; i++) { - palette[i] = color_to_vector3(palette32[i]); - } -} - -/*static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) { - nvDebugCheck(c0.u > c1.u); - - Color32 palette32[4]; - evaluate_palette(c0, c1, palette32); - - for (int i = 0; i < 4; i++) { - palette[i] = color_to_vector3(palette32[i]); - } -}*/ - - - - +// Index selection static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { @@ -678,10 +786,12 @@ inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * } center /= count;*/ - Vector2 covariance = Vector2(0); + float cov_xz = 0.0f; + float cov_yz = 0.0f; for (int i = 0; i < count; i++) { Vector3 t = colors[i] - center; - covariance += t.xy() * t.z; + cov_xz += t.x * t.z; + cov_yz += t.y * t.z; } float x0 = c0->x; @@ -689,10 +799,10 @@ inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * float x1 = c1->x; float y1 = c1->y; - if (covariance.x < 0) { + if (cov_xz < 0) { swap(x0, x1); } - if (covariance.y < 0) { + if (cov_yz < 0) { swap(y0, y1); } @@ -702,22 +812,89 @@ inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * inline static void inset_bbox(Vector3 * restrict c0, Vector3 * restrict c1) { - Vector3 inset = (*c0 - *c1) / 16.0f - (8.0f / 255.0f) / 16.0f; + Vector3 inset = (*c0 - *c1) / 16.0f - Vector3((8.0f / 255.0f) / 16.0f); *c0 = saturate(*c0 - inset); *c1 = saturate(*c1 + inset); } + +// Single color lookup tables from: +// https://github.com/nothings/stb/blob/master/stb_dxt.h +static uint8 match5[256][2]; +static uint8 match6[256][2]; + +static int Mul8Bit(int a, int b) +{ + int t = a * b + 128; + return (t + (t >> 8)) >> 8; +} + +static inline int Lerp13(int a, int b) +{ +#ifdef DXT_USE_ROUNDING_BIAS + // with rounding bias + return a + Mul8Bit(b - a, 0x55); +#else + // without rounding bias + // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. + return (a * 2 + b) / 3; +#endif +} + +static void PrepareOptTable(uint8 * table, const uint8 * expand, int size) +{ + for (int i = 0; i < 256; i++) { + int bestErr = 256 * 100; + + for (int min = 0; min < size; min++) { + for (int max = 0; max < size; max++) { + int mine = expand[min]; + int maxe = expand[max]; + + int err = abs(Lerp13(maxe, mine) - i) * 100; + + // DX10 spec says that interpolation must be within 3% of "correct" result, + // add this as error term. (normally we'd expect a random distribution of + // +-1.5% error, but nowhere in the spec does it say that the error has to be + // unbiased - better safe than sorry). + err += abs(max - min) * 3; + + if (err < bestErr) { + bestErr = err; + table[i * 2 + 0] = max; + table[i * 2 + 1] = min; + } + } + } + } +} + +// @@ Make this explicit. +NV_AT_STARTUP(nv::init_dxt1()); + +void nv::init_dxt1() +{ + // Prepare single color lookup tables. + uint8 expand5[32]; + uint8 expand6[64]; + for (int i = 0; i < 32; i++) expand5[i] = (i << 3) | (i >> 2); + for (int i = 0; i < 64; i++) expand6[i] = (i << 2) | (i >> 4); + + PrepareOptTable(&match5[0][0], expand5, 32); + PrepareOptTable(&match6[0][0], expand6, 64); +} + // Single color compressor, based on: // https://mollyrocket.com/forums/viewtopic.php?t=392 static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { - output->col0.r = OMatch5[c.r][0]; - output->col0.g = OMatch6[c.g][0]; - output->col0.b = OMatch5[c.b][0]; - output->col1.r = OMatch5[c.r][1]; - output->col1.g = OMatch6[c.g][1]; - output->col1.b = OMatch5[c.b][1]; + output->col0.r = match5[c.r][0]; + output->col0.g = match6[c.g][0]; + output->col0.b = match5[c.b][0]; + output->col1.r = match5[c.r][1]; + output->col1.g = match6[c.g][1]; + output->col1.b = match5[c.b][1]; output->indices = 0xaaaaaaaa; if (output->col0.u < output->col1.u) @@ -728,24 +905,23 @@ static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) } -float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +/*float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { ::compress_dxt1_single_color_optimal(c, output); // Multiply by 16^2, the weight associated to a single color. // Divide by 255*255 to covert error to [0-1] range. return (256.0f / (255*255)) * evaluate_mse(output, c, output->indices & 3); -} - +}*/ -float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) +/*float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) { - return compress_dxt1_single_color_optimal(vector3_to_color(color), output); -} + return compress_dxt1_single_color_optimal(vector3_to_color32(color), output); +}*/ // Compress block using the average color. -float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) +float nv::compress_dxt1_single_color(const nv::Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) { // Compute block average. Vector3 color_sum(0); @@ -757,7 +933,7 @@ float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weigh } // Compress optimally. - ::compress_dxt1_single_color_optimal(vector3_to_color(color_sum / weight_sum), output); + ::compress_dxt1_single_color_optimal(vector3_to_color32(color_sum / weight_sum), output); // Decompress block color. Color32 palette[4]; @@ -787,12 +963,12 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], } // Convert to 5:6:5 - int min_r = ftoi_floor(31 * min_color.x); - int min_g = ftoi_floor(63 * min_color.y); - int min_b = ftoi_floor(31 * min_color.z); - int max_r = ftoi_ceil(31 * max_color.x); - int max_g = ftoi_ceil(63 * max_color.y); - int max_b = ftoi_ceil(31 * max_color.z); + int min_r = int(31 * min_color.x); + int min_g = int(63 * min_color.y); + int min_b = int(31 * min_color.z); + int max_r = int(31 * max_color.x + 1); + int max_g = int(63 * max_color.y + 1); + int max_b = int(31 * max_color.z + 1); // Expand the box. int range_r = max_r - min_r; @@ -818,7 +994,7 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], // @@ Convert to fixed point before building box? Color32 colors32[16]; for (int i = 0; i < count; i++) { - colors32[i] = toColor32(Vector4(colors[i], 1)); + colors32[i] = vector3_to_color32(colors[i]); } float best_error = FLT_MAX; @@ -843,7 +1019,7 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], if (c0.u > c1.u) { // Evaluate error in 4 color mode. - evaluate_palette4(c0, c1, palette); + evaluate_palette4(c0, c1, palette, false); } else { if (three_color_mode) { @@ -942,19 +1118,6 @@ void nv::compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 return mask; }*/ - -inline uint32 mod3(uint32 a) { - a = (a >> 16) + (a & 0xFFFF); /* sum base 2**16 digits a <= 0x1FFFE */ - a = (a >> 8) + (a & 0xFF); /* sum base 2**8 digits a <= 0x2FD */ - a = (a >> 4) + (a & 0xF); /* sum base 2**4 digits a <= 0x3C; worst case 0x3B */ - a = (a >> 2) + (a & 0x3); /* sum base 2**2 digits a <= 0x1D; worst case 0x1B */ - a = (a >> 2) + (a & 0x3); /* sum base 2**2 digits a <= 0x9; worst case 0x7 */ - a = (a >> 2) + (a & 0x3); /* sum base 2**2 digits a <= 0x4 */ - if (a > 2) a = a - 3; - return a; -} - - float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output) { Vector3 colors[16]; @@ -1004,7 +1167,8 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already. if (error == FLT_MAX && count == 1) { - error = compress_dxt1_single_color_optimal(colors[0], output); + ::compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); + return evaluate_mse(input_colors, input_weights, color_weights, output); } if (count > 1) { @@ -1107,6 +1271,11 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight refined.col1.b += delta[2]; } + if (!three_color_mode) { + if (refined.col0.u == refined.col1.u) refined.col1.g += 1; + if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u); + } + Vector3 palette[4]; evaluate_palette(output->col0, output->col1, palette); @@ -1159,7 +1328,7 @@ float nv::compress_dxt1_fast(const Vector4 input_colors[16], const float input_w Vector3 c0, c1; fit_colors_bbox(colors, count, &c0, &c1); if (c0 == c1) { - ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output); + ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); return evaluate_mse(input_colors, input_weights, color_weights, output); } inset_bbox(&c0, &c1); @@ -1208,7 +1377,7 @@ void nv::compress_dxt1_fast2(const uint8 input_colors[16*4], BlockDXT1 * output) //select_diagonal(colors, count, &c0, &c1); fit_colors_bbox(vec_colors, 16, &c0, &c1); if (c0 == c1) { - ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output); + ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); return; } inset_bbox(&c0, &c1); @@ -1222,11 +1391,11 @@ void nv::compress_dxt1_fast2(const uint8 input_colors[16*4], BlockDXT1 * output) } -static int Mul8Bit(int a, int b) +/*static int Mul8Bit(int a, int b) { int t = a * b + 128; return (t + (t >> 8)) >> 8; -} +}*/ static bool compute_least_squares_endpoints(const uint8 *block, uint32 mask, Vector3 *pmax, Vector3 *pmin) { @@ -1487,7 +1656,10 @@ void nv::compress_dxt1_fast_geld(const uint8 input_colors[16 * 4], BlockDXT1 * b Vector3 c0, c1; if (!compute_least_squares_endpoints(input_colors, selectors, &c0, &c1)) { // @@ Single color compressor. - Color32 c(lr, lg, lb); + Color32 c; + c.r = lr; + c.g = lg; + c.b = lb; ::compress_dxt1_single_color_optimal(c, block); } else { @@ -1512,7 +1684,7 @@ void nv::compress_dxt1_fast_geld(const uint8 input_colors[16 * 4], BlockDXT1 * b //select_diagonal(colors, count, &c0, &c1); fit_colors_bbox(vec_colors, 16, &c0, &c1); if (c0 == c1) { - ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output); + ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); return; } inset_bbox(&c0, &c1); diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h index bd1422c..ca46b6c 100644 --- a/src/nvtt/CompressorDXT1.h +++ b/src/nvtt/CompressorDXT1.h @@ -1,15 +1,13 @@ namespace nv { - class Color32; struct BlockDXT1; class Vector3; class Vector4; - // All these functions return MSE. + void init_dxt1(); - float compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output); - float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output); + // All these functions return MSE. float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); //float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);