diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp index fded226..ca1a1d5 100644 --- a/src/nvtt/BlockCompressor.cpp +++ b/src/nvtt/BlockCompressor.cpp @@ -214,8 +214,8 @@ void FastCompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], co } void CompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output) { - bool hq = compressionOptions.quality > Quality_Normal; - icbc::compress_dxt1((float*)colors, weights, compressionOptions.colorWeight.component, /*three_color_mode*/true, hq, output); + auto quality_level = compressionOptions.quality > Quality_Normal ? icbc::Quality_Max : icbc::Quality_Default; + icbc::compress_dxt1(quality_level, (float*)colors, weights, compressionOptions.colorWeight.component, /*three_color_mode*/true, /*three_color_black*/true, output); } diff --git a/src/nvtt/CompressorDXT5_RGBM.cpp b/src/nvtt/CompressorDXT5_RGBM.cpp index 0ebacc3..65d547c 100644 --- a/src/nvtt/CompressorDXT5_RGBM.cpp +++ b/src/nvtt/CompressorDXT5_RGBM.cpp @@ -61,7 +61,7 @@ float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_w float color_weights[3] = { 1.0f,1.0f,1.0f }; // Compress RGB. - icbc::compress_dxt1((float *)input_colors_rgbm, rgb_weights, color_weights, /*three_color_mode=*/false, /*hq=*/false, &output->color); + icbc::compress_dxt1(icbc::Quality_Default, (float *)input_colors_rgbm, rgb_weights, color_weights, /*three_color_mode=*/false, /*hq=*/false, &output->color); // Decompress RGB/M block. nv::ColorBlock RGB; diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp index 695e737..af0b0f1 100644 --- a/src/nvtt/Context.cpp +++ b/src/nvtt/Context.cpp @@ -69,7 +69,7 @@ Compressor::Compressor() : m(*new Compressor::Private()) m.dispatcher = &m.defaultDispatcher; - icbc::init(); + icbc::init_dxt1(); } Compressor::~Compressor() diff --git a/src/nvtt/icbc.cpp b/src/nvtt/icbc.cpp index cb2215c..934f099 100644 --- a/src/nvtt/icbc.cpp +++ b/src/nvtt/icbc.cpp @@ -1,2 +1,3 @@ #define ICBC_IMPLEMENTATION +#define ICBC_SIMD 2 // SSE4.1 #include "icbc.h" diff --git a/src/nvtt/icbc.h b/src/nvtt/icbc.h index 56bf3b7..19abad7 100644 --- a/src/nvtt/icbc.h +++ b/src/nvtt/icbc.h @@ -1,3922 +1,3839 @@ - - -namespace icbc { - - void init(); - - float compress_dxt1(const float input_colors[16 * 4], const float input_weights[16], const float color_weights[3], bool three_color_mode, bool hq, void * output); - float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float color_weights[3], void * output); - void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output); - - enum Decoder { - Decoder_D3D10 = 0, - Decoder_NVIDIA = 1, - Decoder_AMD = 2 - }; - - float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10); - -} - -#ifdef ICBC_IMPLEMENTATION - -#ifndef ICBC_USE_SSE -#define ICBC_USE_SSE 2 -#endif - -#ifndef ICBC_DECODER -#define ICBC_DECODER 0 // 0 = d3d10, 1 = d3d9, 2 = nvidia, 3 = amd -#endif - -#define ICBC_USE_SIMD ICBC_USE_SSE - -// Some testing knobs: -#define ICBC_FAST_CLUSTER_FIT 0 // This ignores input weights for a moderate speedup. -#define ICBC_PERFECT_ROUND 0 // Enable perfect rounding in scalar code path only. - -#include -#include // memset -#include // floorf -#include // FLT_MAX - -#ifndef ICBC_ASSERT -#define ICBC_ASSERT assert -#include -#endif - -namespace icbc { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Basic Templates - -template inline void swap(T & a, T & b) { - T temp(a); - a = b; - b = temp; -} - -template inline T max(const T & a, const T & b) { - return (b < a) ? a : b; -} - -template inline T min(const T & a, const T & b) { - return (a < b) ? a : b; -} - -template inline T clamp(const T & x, const T & a, const T & b) { - return min(max(x, a), b); -} - -template inline T square(const T & a) { - return a * a; -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Basic Types - -typedef uint8_t uint8; -typedef int8_t int8; -typedef uint16_t uint16; -typedef uint32_t uint32; -typedef uint32_t uint; - - -struct Color16 { - union { - struct { - uint16 b : 5; - uint16 g : 6; - uint16 r : 5; - }; - uint16 u; - }; -}; - -struct Color32 { - union { - struct { - uint8 b, g, r, a; - }; - uint32 u; - }; -}; - -struct BlockDXT1 { - Color16 col0; - Color16 col1; - uint32 indices; -}; - - -struct Vector3 { - float x; - float y; - float z; - - inline void operator+=(Vector3 v) { - x += v.x; y += v.y; z += v.z; - } - inline void operator*=(Vector3 v) { - x *= v.x; y *= v.y; z *= v.z; - } - inline void operator*=(float s) { - x *= s; y *= s; z *= s; - } -}; - -struct Vector4 { - union { - struct { - float x, y, z, w; - }; - Vector3 xyz; - }; -}; - - -inline Vector3 operator*(Vector3 v, float s) { - return { v.x * s, v.y * s, v.z * s }; -} - -inline Vector3 operator*(float s, Vector3 v) { - return { v.x * s, v.y * s, v.z * s }; -} - -inline Vector3 operator*(Vector3 a, Vector3 b) { - return { a.x * b.x, a.y * b.y, a.z * b.z }; -} - -inline float dot(Vector3 a, Vector3 b) { - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -inline Vector3 operator+(Vector3 a, Vector3 b) { - return { a.x + b.x, a.y + b.y, a.z + b.z }; -} - -inline Vector3 operator-(Vector3 a, Vector3 b) { - return { a.x - b.x, a.y - b.y, a.z - b.z }; -} - -inline Vector3 operator/(Vector3 v, float s) { - return { v.x / s, v.y / s, v.z / s }; -} - -inline float saturate(float x) { - return clamp(x, 0.0f, 1.0f); -} - -inline Vector3 saturate(Vector3 v) { - return { saturate(v.x), saturate(v.y), saturate(v.z) }; -} - -inline Vector3 min(Vector3 a, Vector3 b) { - return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) }; -} - -inline Vector3 max(Vector3 a, Vector3 b) { - return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) }; -} - -inline Vector3 round(Vector3 v) { - return { floorf(v.x+0.5f), floorf(v.y + 0.5f), floorf(v.z + 0.5f) }; -} - -inline Vector3 floor(Vector3 v) { - return { floorf(v.x), floorf(v.y), floorf(v.z) }; -} - -inline bool operator==(const Vector3 & a, const Vector3 & b) { - return a.x == b.x && a.y == b.y && a.z == b.z; -} - -inline Vector3 scalar_to_vector3(float f) { - return {f, f, f}; -} - -inline float lengthSquared(Vector3 v) { - return dot(v, v); -} - -inline bool equal(float a, float b, float epsilon = 0.0001) { - // http://realtimecollisiondetection.net/blog/?p=89 - //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b))); - return fabsf(a - b) < epsilon; -} - -inline bool equal(Vector3 a, Vector3 b, float epsilon) { - return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon); -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// SIMD - -#ifndef ICBC_ALIGN_16 -#if __GNUC__ -# define ICBC_ALIGN_16 __attribute__ ((__aligned__ (16))) -#else // _MSC_VER -# define ICBC_ALIGN_16 __declspec(align(16)) -#endif -#endif - -#if ICBC_USE_SIMD - -#include -#include - -#define SIMD_INLINE inline -#define SIMD_NATIVE __forceinline - -class SimdVector -{ -public: - __m128 vec; - - typedef SimdVector const& Arg; - - SIMD_NATIVE SimdVector() {} - - SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {} - - SIMD_NATIVE explicit SimdVector(float f) { - vec = _mm_set1_ps(f); - } - - SIMD_NATIVE explicit SimdVector(const float * v) - { - vec = _mm_load_ps(v); - } - - SIMD_NATIVE SimdVector(float x, float y, float z, float w) - { - vec = _mm_setr_ps(x, y, z, w); - } - - SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {} - - SIMD_NATIVE SimdVector & operator=(const SimdVector & arg) - { - vec = arg.vec; - return *this; - } - - SIMD_INLINE float toFloat() const - { - ICBC_ALIGN_16 float f; - _mm_store_ss(&f, vec); - return f; - } - - SIMD_INLINE Vector3 toVector3() const - { - ICBC_ALIGN_16 float c[4]; - _mm_store_ps(c, vec); - return { c[0], c[1], c[2] }; - } - -#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6)) - SIMD_NATIVE SimdVector splatX() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(0))); } - SIMD_NATIVE SimdVector splatY() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(1))); } - SIMD_NATIVE SimdVector splatZ() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(2))); } - SIMD_NATIVE SimdVector splatW() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(3))); } -#undef SSE_SPLAT - - SIMD_NATIVE SimdVector& operator+=(Arg v) - { - vec = _mm_add_ps(vec, v.vec); - return *this; - } - - SIMD_NATIVE SimdVector& operator-=(Arg v) - { - vec = _mm_sub_ps(vec, v.vec); - return *this; - } - - SIMD_NATIVE SimdVector& operator*=(Arg v) - { - vec = _mm_mul_ps(vec, v.vec); - return *this; - } -}; - - -SIMD_NATIVE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right) -{ - return SimdVector(_mm_add_ps(left.vec, right.vec)); -} - -SIMD_NATIVE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right) -{ - return SimdVector(_mm_sub_ps(left.vec, right.vec)); -} - -SIMD_NATIVE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right) -{ - return SimdVector(_mm_mul_ps(left.vec, right.vec)); -} - -// Returns a*b + c -SIMD_INLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) -{ - return SimdVector(_mm_add_ps(_mm_mul_ps(a.vec, b.vec), c.vec)); -} - -// Returns -( a*b - c ) -SIMD_INLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) -{ - return SimdVector(_mm_sub_ps(c.vec, _mm_mul_ps(a.vec, b.vec))); -} - -SIMD_INLINE SimdVector reciprocal(SimdVector::Arg v) -{ - // get the reciprocal estimate - __m128 estimate = _mm_rcp_ps(v.vec); - - // one round of Newton-Rhaphson refinement - __m128 diff = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(estimate, v.vec)); - return SimdVector(_mm_add_ps(_mm_mul_ps(diff, estimate), estimate)); -} - -SIMD_NATIVE SimdVector min(SimdVector::Arg left, SimdVector::Arg right) -{ - return SimdVector(_mm_min_ps(left.vec, right.vec)); -} - -SIMD_NATIVE SimdVector max(SimdVector::Arg left, SimdVector::Arg right) -{ - return SimdVector(_mm_max_ps(left.vec, right.vec)); -} - -SIMD_INLINE SimdVector truncate(SimdVector::Arg v) -{ -#if (ICBC_USE_SSE == 1) - // convert to ints - __m128 input = v.vec; - __m64 lo = _mm_cvttps_pi32(input); - __m64 hi = _mm_cvttps_pi32(_mm_movehl_ps(input, input)); - - // convert to floats - __m128 part = _mm_movelh_ps(input, _mm_cvtpi32_ps(input, hi)); - __m128 truncated = _mm_cvtpi32_ps(part, lo); - - // clear out the MMX multimedia state to allow FP calls later - _mm_empty(); - return SimdVector(truncated); -#else - // use SSE2 instructions - return SimdVector(_mm_cvtepi32_ps(_mm_cvttps_epi32(v.vec))); -#endif -} - -SIMD_INLINE SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits) -{ - __m128 a = _mm_andnot_ps(bits.vec, off.vec); - __m128 b = _mm_and_ps(bits.vec, on.vec); - - return SimdVector(_mm_or_ps(a, b)); -} - -SIMD_INLINE bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right) -{ - __m128 bits = _mm_cmplt_ps(left.vec, right.vec); - int value = _mm_movemask_ps(bits); - return value != 0; -} - -#endif // ICBC_USE_SIMD - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Color conversion functions. - -static const float midpoints5[32] = { - 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, - 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f -}; - -static const float midpoints6[64] = { - 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, - 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, - 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, - 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f -}; - -/*void init_tables() { - for (int i = 0; i < 31; i++) { - float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; - float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; - midpoints5[i] = (f0 + f1) * 0.5; - } - midpoints5[31] = 1.0f; - - for (int i = 0; i < 63; i++) { - float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; - float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; - midpoints6[i] = (f0 + f1) * 0.5; - } - midpoints6[63] = 1.0f; -}*/ - -static Color16 vector3_to_color16(const Vector3 & v) { - - // Truncate. - uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f)); - uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f)); - uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f)); - - // Round exactly according to 565 bit-expansion. - r += (v.x > midpoints5[r]); - g += (v.y > midpoints6[g]); - b += (v.z > midpoints5[b]); - - Color16 c; - c.u = (r << 11) | (g << 5) | b; - return c; -} - -static Color32 bitexpand_color16_to_color32(Color16 c16) { - Color32 c32; - //c32.b = (c16.b << 3) | (c16.b >> 2); - //c32.g = (c16.g << 2) | (c16.g >> 4); - //c32.r = (c16.r << 3) | (c16.r >> 2); - //c32.a = 0xFF; - - c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); - c32.u |= (c32.u >> 5) & 0x070007; - c32.u |= (c32.u >> 6) & 0x000300; - - return c32; -} - -inline Vector3 color_to_vector3(Color32 c) { - return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f }; -} - -inline Color32 vector3_to_color32(Vector3 v) { - Color32 color; - color.r = uint8(saturate(v.x) * 255 + 0.5f); - color.g = uint8(saturate(v.y) * 255 + 0.5f); - color.b = uint8(saturate(v.z) * 255 + 0.5f); - color.a = 255; - return color; -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Input block processing. - -// Find similar colors and combine them together. -static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights) -{ -#if 0 - for (int i = 0; i < 16; i++) { - colors[i] = input_colors[i].xyz; - weights[i] = input_weights[i]; - } - return 16; -#else - int n = 0; - for (int i = 0; i < 16; i++) - { - Vector3 ci = input_colors[i].xyz; - float wi = input_weights[i]; - - float threshold = 1.0 / 256; - - if (wi > 0) { - // Find matching color. - int j; - for (j = 0; j < n; j++) { - if (equal(colors[j], ci, threshold)) { - weights[j] += wi; - break; - } - } - - // No match found. Add new color. - if (j == n) { - colors[n] = ci; - weights[n] = wi; - n++; - } - } - } - - ICBC_ASSERT(n <= 16); - - return n; -#endif -} - -static int reduce_colors(const uint8 * input_colors, Vector3 * colors, float * weights) -{ - int n = 0; - for (int i = 0; i < 16; i++) - { - Vector3 ci; - ci.x = float(input_colors[4 * i + 0]); - ci.y = float(input_colors[4 * i + 1]); - ci.z = float(input_colors[4 * i + 2]); - - float threshold = 1.0 / 256; - - // Find matching color. - int j; - for (j = 0; j < n; j++) { - if (equal(colors[j], ci, threshold)) { - weights[j] += 1.0f; - break; - } - } - - // No match found. Add new color. - if (j == n) { - colors[n] = ci; - weights[n] = 1.0f; - n++; - } - } - - ICBC_ASSERT(n <= 16); - - return n; -} - - - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Cluster Fit - -class ClusterFit -{ -public: - ClusterFit() {} - - void setErrorMetric(const Vector3 & metric); - - void setColorSet(const Vector3 * colors, const float * weights, int count, const Vector3 & metric); - void setColorSet(const Vector4 * colors, const Vector3 & metric); - - float bestError() const; - - bool compress3(Vector3 * start, Vector3 * end); - bool compress4(Vector3 * start, Vector3 * end); - - bool fastCompress3(Vector3 * start, Vector3 * end); - bool fastCompress4(Vector3 * start, Vector3 * end); - - -private: - - uint m_count; - -#if ICBC_USE_SIMD - ICBC_ALIGN_16 SimdVector m_weighted[16]; // color | weight - SimdVector m_metric; // vec3 - SimdVector m_metricSqr; // vec3 - SimdVector m_xxsum; // color | weight - SimdVector m_xsum; // color | weight (wsum) - SimdVector m_besterror; // scalar -#else - Vector3 m_weighted[16]; - float m_weights[16]; - Vector3 m_metric; - Vector3 m_metricSqr; - Vector3 m_xxsum; - Vector3 m_xsum; - float m_wsum; - float m_besterror; -#endif -}; - - -static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights) -{ - Vector3 centroid = { 0 }; - float total = 0.0f; - - for (int i = 0; i < n; i++) - { - total += weights[i]; - centroid += weights[i] * points[i]; - } - centroid *= (1.0f / total); - - return centroid; -} - -static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance) -{ - // compute the centroid - Vector3 centroid = computeCentroid(n, points, weights); - - // compute covariance matrix - for (int i = 0; i < 6; i++) - { - covariance[i] = 0.0f; - } - - for (int i = 0; i < n; i++) - { - Vector3 a = (points[i] - centroid); // @@ I think weight should be squared, but that seems to increase the error slightly. - Vector3 b = weights[i] * a; - - covariance[0] += a.x * b.x; - covariance[1] += a.x * b.y; - covariance[2] += a.x * b.z; - covariance[3] += a.y * b.y; - covariance[4] += a.y * b.z; - covariance[5] += a.z * b.z; - } - - return centroid; -} - -// @@ We should be able to do something cheaper... -static Vector3 estimatePrincipalComponent(const float * __restrict matrix) -{ - const Vector3 row0 = { matrix[0], matrix[1], matrix[2] }; - const Vector3 row1 = { matrix[1], matrix[3], matrix[4] }; - const Vector3 row2 = { matrix[2], matrix[4], matrix[5] }; - - float r0 = lengthSquared(row0); - float r1 = lengthSquared(row1); - float r2 = lengthSquared(row2); - - if (r0 > r1 && r0 > r2) return row0; - if (r1 > r2) return row1; - return row2; -} - -static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) -{ - if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) - { - return {0}; - } - - Vector3 v = estimatePrincipalComponent(matrix); - - const int NUM = 8; - for (int i = 0; i < NUM; i++) - { - float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; - float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; - float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; - - float norm = max(max(x, y), z); - - v = { x, y, z }; - v *= (1.0f / norm); - } - - return v; -} - -static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights) -{ - float matrix[6]; - computeCovariance(n, points, weights, matrix); - - return firstEigenVector_PowerMethod(matrix); -} - - -void ClusterFit::setErrorMetric(const Vector3 & metric) -{ -#if ICBC_USE_SIMD - ICBC_ALIGN_16 Vector4 tmp; - tmp.xyz = metric; - tmp.w = 1; - m_metric = SimdVector(&tmp.x); -#else - m_metric = metric; -#endif - m_metricSqr = m_metric * m_metric; -} - -void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count, const Vector3 & metric) -{ - setErrorMetric(metric); - - // initialise the best error -#if ICBC_USE_SIMD - m_besterror = SimdVector(FLT_MAX); -#else - m_besterror = FLT_MAX; -#endif - - m_count = count; - - // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. - Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights); - - // build the list of values - int order[16]; - float dps[16]; - for (uint i = 0; i < m_count; ++i) - { - order[i] = i; - dps[i] = dot(colors[i], principal); - } - - // stable sort - for (uint i = 0; i < m_count; ++i) - { - for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j) - { - swap(dps[j], dps[j - 1]); - swap(order[j], order[j - 1]); - } - } - - // weight all the points -#if ICBC_USE_SIMD - m_xxsum = SimdVector(0.0f); - m_xsum = SimdVector(0.0f); -#else - m_xxsum = { 0.0f }; - m_xsum = { 0.0f }; - m_wsum = 0.0f; -#endif - - for (uint i = 0; i < m_count; ++i) - { - int p = order[i]; -#if ICBC_USE_SIMD - ICBC_ALIGN_16 Vector4 tmp; - tmp.xyz = colors[p]; - tmp.w = 1; - m_weighted[i] = SimdVector(&tmp.x) * SimdVector(weights[p]); - m_xxsum += m_weighted[i] * m_weighted[i]; - m_xsum += m_weighted[i]; -#else - m_weighted[i] = colors[p] * weights[p]; - m_xxsum += m_weighted[i] * m_weighted[i]; - m_xsum += m_weighted[i]; - m_weights[i] = weights[p]; - m_wsum += m_weights[i]; -#endif - } -} - -void ClusterFit::setColorSet(const Vector4 * colors, const Vector3 & metric) -{ - setErrorMetric(metric); - - // initialise the best error -#if ICBC_USE_SIMD - m_besterror = SimdVector(FLT_MAX); -#else - m_besterror = FLT_MAX; -#endif - - m_count = 16; - - static const float weights[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}; - Vector3 vc[16]; - for (int i = 0; i < 16; i++) vc[i] = colors[i].xyz; - - // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. - Vector3 principal = computePrincipalComponent_PowerMethod(16, vc, weights); - - // build the list of values - int order[16]; - float dps[16]; - for (uint i = 0; i < m_count; ++i) - { - order[i] = i; - dps[i] = dot(colors[i].xyz, principal); - } - - // stable sort - for (uint i = 0; i < m_count; ++i) - { - for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j) - { - swap(dps[j], dps[j - 1]); - swap(order[j], order[j - 1]); - } - } - - // weight all the points -#if ICBC_USE_SIMD - m_xxsum = SimdVector(0.0f); - m_xsum = SimdVector(0.0f); -#else - m_xxsum = { 0.0f }; - m_xsum = { 0.0f }; - m_wsum = 0.0f; -#endif - - for (uint i = 0; i < 16; ++i) - { - int p = order[i]; -#if ICBC_USE_SIMD - ICBC_ALIGN_16 Vector4 tmp; - tmp.xyz = colors[p].xyz; - tmp.w = 1; - m_weighted[i] = SimdVector(&tmp.x); - m_xxsum += m_weighted[i] * m_weighted[i]; - m_xsum += m_weighted[i]; -#else - m_weighted[i] = colors[p].xyz; - m_xxsum += m_weighted[i] * m_weighted[i]; - m_xsum += m_weighted[i]; - m_weights[i] = 1.0f; - m_wsum += m_weights[i]; -#endif - } -} - -float ClusterFit::bestError() const -{ -#if ICBC_USE_SIMD - SimdVector x = m_xxsum * m_metricSqr; - SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ(); - return error.toFloat(); -#else - return m_besterror + dot(m_xxsum, m_metricSqr); -#endif -} - -struct Precomp { - float alpha2_sum; - float beta2_sum; - float alphabeta_sum; - float factor; -}; - -static const ICBC_ALIGN_16 Precomp s_threeElement[153] = { - { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 16) - { 0.250000f, 15.250000f, 0.250000f, 0.266667f }, // 1 (0 1 15) - { 0.500000f, 14.500000f, 0.500000f, 0.142857f }, // 2 (0 2 14) - { 0.750000f, 13.750000f, 0.750000f, 0.102564f }, // 3 (0 3 13) - { 1.000000f, 13.000000f, 1.000000f, 0.083333f }, // 4 (0 4 12) - { 1.250000f, 12.250000f, 1.250000f, 0.072727f }, // 5 (0 5 11) - { 1.500000f, 11.500000f, 1.500000f, 0.066667f }, // 6 (0 6 10) - { 1.750000f, 10.750000f, 1.750000f, 0.063492f }, // 7 (0 7 9) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 8 (0 8 8) - { 2.250000f, 9.250000f, 2.250000f, 0.063492f }, // 9 (0 9 7) - { 2.500000f, 8.500000f, 2.500000f, 0.066667f }, // 10 (0 10 6) - { 2.750000f, 7.750000f, 2.750000f, 0.072727f }, // 11 (0 11 5) - { 3.000000f, 7.000000f, 3.000000f, 0.083333f }, // 12 (0 12 4) - { 3.250000f, 6.250000f, 3.250000f, 0.102564f }, // 13 (0 13 3) - { 3.500000f, 5.500000f, 3.500000f, 0.142857f }, // 14 (0 14 2) - { 3.750000f, 4.750000f, 3.750000f, 0.266667f }, // 15 (0 15 1) - { 4.000000f, 4.000000f, 4.000000f, FLT_MAX }, // 16 (0 16 0) - { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 17 (1 0 15) - { 1.250000f, 14.250000f, 0.250000f, 0.056338f }, // 18 (1 1 14) - { 1.500000f, 13.500000f, 0.500000f, 0.050000f }, // 19 (1 2 13) - { 1.750000f, 12.750000f, 0.750000f, 0.045977f }, // 20 (1 3 12) - { 2.000000f, 12.000000f, 1.000000f, 0.043478f }, // 21 (1 4 11) - { 2.250000f, 11.250000f, 1.250000f, 0.042105f }, // 22 (1 5 10) - { 2.500000f, 10.500000f, 1.500000f, 0.041667f }, // 23 (1 6 9) - { 2.750000f, 9.750000f, 1.750000f, 0.042105f }, // 24 (1 7 8) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 25 (1 8 7) - { 3.250000f, 8.250000f, 2.250000f, 0.045977f }, // 26 (1 9 6) - { 3.500000f, 7.500000f, 2.500000f, 0.050000f }, // 27 (1 10 5) - { 3.750000f, 6.750000f, 2.750000f, 0.056338f }, // 28 (1 11 4) - { 4.000000f, 6.000000f, 3.000000f, 0.066667f }, // 29 (1 12 3) - { 4.250000f, 5.250000f, 3.250000f, 0.085106f }, // 30 (1 13 2) - { 4.500000f, 4.500000f, 3.500000f, 0.125000f }, // 31 (1 14 1) - { 4.750000f, 3.750000f, 3.750000f, 0.266667f }, // 32 (1 15 0) - { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 33 (2 0 14) - { 2.250000f, 13.250000f, 0.250000f, 0.033613f }, // 34 (2 1 13) - { 2.500000f, 12.500000f, 0.500000f, 0.032258f }, // 35 (2 2 12) - { 2.750000f, 11.750000f, 0.750000f, 0.031496f }, // 36 (2 3 11) - { 3.000000f, 11.000000f, 1.000000f, 0.031250f }, // 37 (2 4 10) - { 3.250000f, 10.250000f, 1.250000f, 0.031496f }, // 38 (2 5 9) - { 3.500000f, 9.500000f, 1.500000f, 0.032258f }, // 39 (2 6 8) - { 3.750000f, 8.750000f, 1.750000f, 0.033613f }, // 40 (2 7 7) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 41 (2 8 6) - { 4.250000f, 7.250000f, 2.250000f, 0.038835f }, // 42 (2 9 5) - { 4.500000f, 6.500000f, 2.500000f, 0.043478f }, // 43 (2 10 4) - { 4.750000f, 5.750000f, 2.750000f, 0.050633f }, // 44 (2 11 3) - { 5.000000f, 5.000000f, 3.000000f, 0.062500f }, // 45 (2 12 2) - { 5.250000f, 4.250000f, 3.250000f, 0.085106f }, // 46 (2 13 1) - { 5.500000f, 3.500000f, 3.500000f, 0.142857f }, // 47 (2 14 0) - { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 48 (3 0 13) - { 3.250000f, 12.250000f, 0.250000f, 0.025157f }, // 49 (3 1 12) - { 3.500000f, 11.500000f, 0.500000f, 0.025000f }, // 50 (3 2 11) - { 3.750000f, 10.750000f, 0.750000f, 0.025157f }, // 51 (3 3 10) - { 4.000000f, 10.000000f, 1.000000f, 0.025641f }, // 52 (3 4 9) - { 4.250000f, 9.250000f, 1.250000f, 0.026490f }, // 53 (3 5 8) - { 4.500000f, 8.500000f, 1.500000f, 0.027778f }, // 54 (3 6 7) - { 4.750000f, 7.750000f, 1.750000f, 0.029630f }, // 55 (3 7 6) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 56 (3 8 5) - { 5.250000f, 6.250000f, 2.250000f, 0.036036f }, // 57 (3 9 4) - { 5.500000f, 5.500000f, 2.500000f, 0.041667f }, // 58 (3 10 3) - { 5.750000f, 4.750000f, 2.750000f, 0.050633f }, // 59 (3 11 2) - { 6.000000f, 4.000000f, 3.000000f, 0.066667f }, // 60 (3 12 1) - { 6.250000f, 3.250000f, 3.250000f, 0.102564f }, // 61 (3 13 0) - { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 62 (4 0 12) - { 4.250000f, 11.250000f, 0.250000f, 0.020942f }, // 63 (4 1 11) - { 4.500000f, 10.500000f, 0.500000f, 0.021277f }, // 64 (4 2 10) - { 4.750000f, 9.750000f, 0.750000f, 0.021858f }, // 65 (4 3 9) - { 5.000000f, 9.000000f, 1.000000f, 0.022727f }, // 66 (4 4 8) - { 5.250000f, 8.250000f, 1.250000f, 0.023952f }, // 67 (4 5 7) - { 5.500000f, 7.500000f, 1.500000f, 0.025641f }, // 68 (4 6 6) - { 5.750000f, 6.750000f, 1.750000f, 0.027972f }, // 69 (4 7 5) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 70 (4 8 4) - { 6.250000f, 5.250000f, 2.250000f, 0.036036f }, // 71 (4 9 3) - { 6.500000f, 4.500000f, 2.500000f, 0.043478f }, // 72 (4 10 2) - { 6.750000f, 3.750000f, 2.750000f, 0.056338f }, // 73 (4 11 1) - { 7.000000f, 3.000000f, 3.000000f, 0.083333f }, // 74 (4 12 0) - { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 75 (5 0 11) - { 5.250000f, 10.250000f, 0.250000f, 0.018605f }, // 76 (5 1 10) - { 5.500000f, 9.500000f, 0.500000f, 0.019231f }, // 77 (5 2 9) - { 5.750000f, 8.750000f, 0.750000f, 0.020101f }, // 78 (5 3 8) - { 6.000000f, 8.000000f, 1.000000f, 0.021277f }, // 79 (5 4 7) - { 6.250000f, 7.250000f, 1.250000f, 0.022857f }, // 80 (5 5 6) - { 6.500000f, 6.500000f, 1.500000f, 0.025000f }, // 81 (5 6 5) - { 6.750000f, 5.750000f, 1.750000f, 0.027972f }, // 82 (5 7 4) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 83 (5 8 3) - { 7.250000f, 4.250000f, 2.250000f, 0.038835f }, // 84 (5 9 2) - { 7.500000f, 3.500000f, 2.500000f, 0.050000f }, // 85 (5 10 1) - { 7.750000f, 2.750000f, 2.750000f, 0.072727f }, // 86 (5 11 0) - { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 87 (6 0 10) - { 6.250000f, 9.250000f, 0.250000f, 0.017316f }, // 88 (6 1 9) - { 6.500000f, 8.500000f, 0.500000f, 0.018182f }, // 89 (6 2 8) - { 6.750000f, 7.750000f, 0.750000f, 0.019324f }, // 90 (6 3 7) - { 7.000000f, 7.000000f, 1.000000f, 0.020833f }, // 91 (6 4 6) - { 7.250000f, 6.250000f, 1.250000f, 0.022857f }, // 92 (6 5 5) - { 7.500000f, 5.500000f, 1.500000f, 0.025641f }, // 93 (6 6 4) - { 7.750000f, 4.750000f, 1.750000f, 0.029630f }, // 94 (6 7 3) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 95 (6 8 2) - { 8.250000f, 3.250000f, 2.250000f, 0.045977f }, // 96 (6 9 1) - { 8.500000f, 2.500000f, 2.500000f, 0.066667f }, // 97 (6 10 0) - { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 98 (7 0 9) - { 7.250000f, 8.250000f, 0.250000f, 0.016736f }, // 99 (7 1 8) - { 7.500000f, 7.500000f, 0.500000f, 0.017857f }, // 100 (7 2 7) - { 7.750000f, 6.750000f, 0.750000f, 0.019324f }, // 101 (7 3 6) - { 8.000000f, 6.000000f, 1.000000f, 0.021277f }, // 102 (7 4 5) - { 8.250000f, 5.250000f, 1.250000f, 0.023952f }, // 103 (7 5 4) - { 8.500000f, 4.500000f, 1.500000f, 0.027778f }, // 104 (7 6 3) - { 8.750000f, 3.750000f, 1.750000f, 0.033613f }, // 105 (7 7 2) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 106 (7 8 1) - { 9.250000f, 2.250000f, 2.250000f, 0.063492f }, // 107 (7 9 0) - { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 108 (8 0 8) - { 8.250000f, 7.250000f, 0.250000f, 0.016736f }, // 109 (8 1 7) - { 8.500000f, 6.500000f, 0.500000f, 0.018182f }, // 110 (8 2 6) - { 8.750000f, 5.750000f, 0.750000f, 0.020101f }, // 111 (8 3 5) - { 9.000000f, 5.000000f, 1.000000f, 0.022727f }, // 112 (8 4 4) - { 9.250000f, 4.250000f, 1.250000f, 0.026490f }, // 113 (8 5 3) - { 9.500000f, 3.500000f, 1.500000f, 0.032258f }, // 114 (8 6 2) - { 9.750000f, 2.750000f, 1.750000f, 0.042105f }, // 115 (8 7 1) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 116 (8 8 0) - { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 117 (9 0 7) - { 9.250000f, 6.250000f, 0.250000f, 0.017316f }, // 118 (9 1 6) - { 9.500000f, 5.500000f, 0.500000f, 0.019231f }, // 119 (9 2 5) - { 9.750000f, 4.750000f, 0.750000f, 0.021858f }, // 120 (9 3 4) - { 10.000000f, 4.000000f, 1.000000f, 0.025641f }, // 121 (9 4 3) - { 10.250000f, 3.250000f, 1.250000f, 0.031496f }, // 122 (9 5 2) - { 10.500000f, 2.500000f, 1.500000f, 0.041667f }, // 123 (9 6 1) - { 10.750000f, 1.750000f, 1.750000f, 0.063492f }, // 124 (9 7 0) - { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 125 (10 0 6) - { 10.250000f, 5.250000f, 0.250000f, 0.018605f }, // 126 (10 1 5) - { 10.500000f, 4.500000f, 0.500000f, 0.021277f }, // 127 (10 2 4) - { 10.750000f, 3.750000f, 0.750000f, 0.025157f }, // 128 (10 3 3) - { 11.000000f, 3.000000f, 1.000000f, 0.031250f }, // 129 (10 4 2) - { 11.250000f, 2.250000f, 1.250000f, 0.042105f }, // 130 (10 5 1) - { 11.500000f, 1.500000f, 1.500000f, 0.066667f }, // 131 (10 6 0) - { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 132 (11 0 5) - { 11.250000f, 4.250000f, 0.250000f, 0.020942f }, // 133 (11 1 4) - { 11.500000f, 3.500000f, 0.500000f, 0.025000f }, // 134 (11 2 3) - { 11.750000f, 2.750000f, 0.750000f, 0.031496f }, // 135 (11 3 2) - { 12.000000f, 2.000000f, 1.000000f, 0.043478f }, // 136 (11 4 1) - { 12.250000f, 1.250000f, 1.250000f, 0.072727f }, // 137 (11 5 0) - { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 138 (12 0 4) - { 12.250000f, 3.250000f, 0.250000f, 0.025157f }, // 139 (12 1 3) - { 12.500000f, 2.500000f, 0.500000f, 0.032258f }, // 140 (12 2 2) - { 12.750000f, 1.750000f, 0.750000f, 0.045977f }, // 141 (12 3 1) - { 13.000000f, 1.000000f, 1.000000f, 0.083333f }, // 142 (12 4 0) - { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 143 (13 0 3) - { 13.250000f, 2.250000f, 0.250000f, 0.033613f }, // 144 (13 1 2) - { 13.500000f, 1.500000f, 0.500000f, 0.050000f }, // 145 (13 2 1) - { 13.750000f, 0.750000f, 0.750000f, 0.102564f }, // 146 (13 3 0) - { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 147 (14 0 2) - { 14.250000f, 1.250000f, 0.250000f, 0.056338f }, // 148 (14 1 1) - { 14.500000f, 0.500000f, 0.500000f, 0.142857f }, // 149 (14 2 0) - { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 150 (15 0 1) - { 15.250000f, 0.250000f, 0.250000f, 0.266667f }, // 151 (15 1 0) - { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 152 (16 0 0) -}; // 153 three cluster elements - -static const ICBC_ALIGN_16 Precomp s_fourElement[969] = { - { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 0 16) - { 0.111111f, 15.444445f, 0.222222f, 0.600000f }, // 1 (0 0 1 15) - { 0.222222f, 14.888889f, 0.444444f, 0.321429f }, // 2 (0 0 2 14) - { 0.333333f, 14.333333f, 0.666667f, 0.230769f }, // 3 (0 0 3 13) - { 0.444444f, 13.777778f, 0.888889f, 0.187500f }, // 4 (0 0 4 12) - { 0.555556f, 13.222222f, 1.111111f, 0.163636f }, // 5 (0 0 5 11) - { 0.666667f, 12.666667f, 1.333333f, 0.150000f }, // 6 (0 0 6 10) - { 0.777778f, 12.111111f, 1.555556f, 0.142857f }, // 7 (0 0 7 9) - { 0.888889f, 11.555555f, 1.777778f, 0.140625f }, // 8 (0 0 8 8) - { 1.000000f, 11.000000f, 2.000000f, 0.142857f }, // 9 (0 0 9 7) - { 1.111111f, 10.444445f, 2.222222f, 0.150000f }, // 10 (0 0 10 6) - { 1.222222f, 9.888889f, 2.444444f, 0.163636f }, // 11 (0 0 11 5) - { 1.333333f, 9.333333f, 2.666667f, 0.187500f }, // 12 (0 0 12 4) - { 1.444444f, 8.777778f, 2.888889f, 0.230769f }, // 13 (0 0 13 3) - { 1.555556f, 8.222222f, 3.111111f, 0.321429f }, // 14 (0 0 14 2) - { 1.666667f, 7.666667f, 3.333333f, 0.600000f }, // 15 (0 0 15 1) - { 1.777778f, 7.111111f, 3.555556f, FLT_MAX }, // 16 (0 0 16 0) - { 0.444444f, 15.111111f, 0.222222f, 0.150000f }, // 17 (0 1 0 15) - { 0.555556f, 14.555555f, 0.444444f, 0.126761f }, // 18 (0 1 1 14) - { 0.666667f, 14.000000f, 0.666667f, 0.112500f }, // 19 (0 1 2 13) - { 0.777778f, 13.444445f, 0.888889f, 0.103448f }, // 20 (0 1 3 12) - { 0.888889f, 12.888889f, 1.111111f, 0.097826f }, // 21 (0 1 4 11) - { 1.000000f, 12.333333f, 1.333333f, 0.094737f }, // 22 (0 1 5 10) - { 1.111111f, 11.777778f, 1.555556f, 0.093750f }, // 23 (0 1 6 9) - { 1.222222f, 11.222222f, 1.777778f, 0.094737f }, // 24 (0 1 7 8) - { 1.333333f, 10.666667f, 2.000000f, 0.097826f }, // 25 (0 1 8 7) - { 1.444444f, 10.111111f, 2.222222f, 0.103448f }, // 26 (0 1 9 6) - { 1.555556f, 9.555555f, 2.444444f, 0.112500f }, // 27 (0 1 10 5) - { 1.666667f, 9.000000f, 2.666667f, 0.126761f }, // 28 (0 1 11 4) - { 1.777778f, 8.444445f, 2.888889f, 0.150000f }, // 29 (0 1 12 3) - { 1.888889f, 7.888889f, 3.111111f, 0.191489f }, // 30 (0 1 13 2) - { 2.000000f, 7.333333f, 3.333333f, 0.281250f }, // 31 (0 1 14 1) - { 2.111111f, 6.777778f, 3.555556f, 0.600000f }, // 32 (0 1 15 0) - { 0.888889f, 14.222222f, 0.444444f, 0.080357f }, // 33 (0 2 0 14) - { 1.000000f, 13.666667f, 0.666667f, 0.075630f }, // 34 (0 2 1 13) - { 1.111111f, 13.111111f, 0.888889f, 0.072581f }, // 35 (0 2 2 12) - { 1.222222f, 12.555555f, 1.111111f, 0.070866f }, // 36 (0 2 3 11) - { 1.333333f, 12.000000f, 1.333333f, 0.070313f }, // 37 (0 2 4 10) - { 1.444444f, 11.444445f, 1.555556f, 0.070866f }, // 38 (0 2 5 9) - { 1.555556f, 10.888889f, 1.777778f, 0.072581f }, // 39 (0 2 6 8) - { 1.666667f, 10.333333f, 2.000000f, 0.075630f }, // 40 (0 2 7 7) - { 1.777778f, 9.777778f, 2.222222f, 0.080357f }, // 41 (0 2 8 6) - { 1.888889f, 9.222222f, 2.444444f, 0.087379f }, // 42 (0 2 9 5) - { 2.000000f, 8.666667f, 2.666667f, 0.097826f }, // 43 (0 2 10 4) - { 2.111111f, 8.111111f, 2.888889f, 0.113924f }, // 44 (0 2 11 3) - { 2.222222f, 7.555556f, 3.111111f, 0.140625f }, // 45 (0 2 12 2) - { 2.333333f, 7.000000f, 3.333333f, 0.191489f }, // 46 (0 2 13 1) - { 2.444444f, 6.444445f, 3.555556f, 0.321429f }, // 47 (0 2 14 0) - { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 48 (0 3 0 13) - { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 49 (0 3 1 12) - { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 50 (0 3 2 11) - { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 51 (0 3 3 10) - { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 52 (0 3 4 9) - { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 53 (0 3 5 8) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 54 (0 3 6 7) - { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 55 (0 3 7 6) - { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 56 (0 3 8 5) - { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 57 (0 3 9 4) - { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 58 (0 3 10 3) - { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 59 (0 3 11 2) - { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 60 (0 3 12 1) - { 2.777778f, 6.111111f, 3.555556f, 0.230769f }, // 61 (0 3 13 0) - { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 62 (0 4 0 12) - { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 63 (0 4 1 11) - { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 64 (0 4 2 10) - { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 65 (0 4 3 9) - { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 66 (0 4 4 8) - { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 67 (0 4 5 7) - { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 68 (0 4 6 6) - { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 69 (0 4 7 5) - { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 70 (0 4 8 4) - { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 71 (0 4 9 3) - { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 72 (0 4 10 2) - { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 73 (0 4 11 1) - { 3.111111f, 5.777778f, 3.555556f, 0.187500f }, // 74 (0 4 12 0) - { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 75 (0 5 0 11) - { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 76 (0 5 1 10) - { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 77 (0 5 2 9) - { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 78 (0 5 3 8) - { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 79 (0 5 4 7) - { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 80 (0 5 5 6) - { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 81 (0 5 6 5) - { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 82 (0 5 7 4) - { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 83 (0 5 8 3) - { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 84 (0 5 9 2) - { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 85 (0 5 10 1) - { 3.444444f, 5.444445f, 3.555556f, 0.163636f }, // 86 (0 5 11 0) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 87 (0 6 0 10) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 88 (0 6 1 9) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 89 (0 6 2 8) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 90 (0 6 3 7) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 91 (0 6 4 6) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 92 (0 6 5 5) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 93 (0 6 6 4) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 94 (0 6 7 3) - { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 95 (0 6 8 2) - { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 96 (0 6 9 1) - { 3.777778f, 5.111111f, 3.555556f, 0.150000f }, // 97 (0 6 10 0) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 98 (0 7 0 9) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 99 (0 7 1 8) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 100 (0 7 2 7) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 101 (0 7 3 6) - { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 102 (0 7 4 5) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 103 (0 7 5 4) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 104 (0 7 6 3) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 105 (0 7 7 2) - { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 106 (0 7 8 1) - { 4.111111f, 4.777778f, 3.555556f, 0.142857f }, // 107 (0 7 9 0) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 108 (0 8 0 8) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 109 (0 8 1 7) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 110 (0 8 2 6) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 111 (0 8 3 5) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 112 (0 8 4 4) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 113 (0 8 5 3) - { 4.222222f, 5.555555f, 3.111111f, 0.072581f }, // 114 (0 8 6 2) - { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 115 (0 8 7 1) - { 4.444445f, 4.444445f, 3.555556f, 0.140625f }, // 116 (0 8 8 0) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 117 (0 9 0 7) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 118 (0 9 1 6) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 119 (0 9 2 5) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 120 (0 9 3 4) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 121 (0 9 4 3) - { 4.555556f, 5.222222f, 3.111111f, 0.070866f }, // 122 (0 9 5 2) - { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 123 (0 9 6 1) - { 4.777778f, 4.111111f, 3.555556f, 0.142857f }, // 124 (0 9 7 0) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 125 (0 10 0 6) - { 4.555556f, 6.555555f, 2.444444f, 0.041860f }, // 126 (0 10 1 5) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 127 (0 10 2 4) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 128 (0 10 3 3) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 129 (0 10 4 2) - { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 130 (0 10 5 1) - { 5.111111f, 3.777778f, 3.555556f, 0.150000f }, // 131 (0 10 6 0) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 132 (0 11 0 5) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 133 (0 11 1 4) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 134 (0 11 2 3) - { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 135 (0 11 3 2) - { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 136 (0 11 4 1) - { 5.444445f, 3.444444f, 3.555556f, 0.163636f }, // 137 (0 11 5 0) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 138 (0 12 0 4) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 139 (0 12 1 3) - { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 140 (0 12 2 2) - { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 141 (0 12 3 1) - { 5.777778f, 3.111111f, 3.555556f, 0.187500f }, // 142 (0 12 4 0) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 143 (0 13 0 3) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 144 (0 13 1 2) - { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 145 (0 13 2 1) - { 6.111111f, 2.777778f, 3.555556f, 0.230769f }, // 146 (0 13 3 0) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 147 (0 14 0 2) - { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 148 (0 14 1 1) - { 6.444445f, 2.444444f, 3.555556f, 0.321429f }, // 149 (0 14 2 0) - { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 150 (0 15 0 1) - { 6.777778f, 2.111111f, 3.555556f, 0.600000f }, // 151 (0 15 1 0) - { 7.111111f, 1.777778f, 3.555556f, FLT_MAX }, // 152 (0 16 0 0) - { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 153 (1 0 0 15) - { 1.111111f, 14.444445f, 0.222222f, 0.062500f }, // 154 (1 0 1 14) - { 1.222222f, 13.888889f, 0.444444f, 0.059603f }, // 155 (1 0 2 13) - { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 156 (1 0 3 12) - { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 157 (1 0 4 11) - { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 158 (1 0 5 10) - { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 159 (1 0 6 9) - { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 160 (1 0 7 8) - { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 161 (1 0 8 7) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 162 (1 0 9 6) - { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 163 (1 0 10 5) - { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 164 (1 0 11 4) - { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 165 (1 0 12 3) - { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 166 (1 0 13 2) - { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 167 (1 0 14 1) - { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 168 (1 0 15 0) - { 1.444444f, 14.111111f, 0.222222f, 0.049180f }, // 169 (1 1 0 14) - { 1.555556f, 13.555555f, 0.444444f, 0.047872f }, // 170 (1 1 1 13) - { 1.666667f, 13.000000f, 0.666667f, 0.047120f }, // 171 (1 1 2 12) - { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 172 (1 1 3 11) - { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 173 (1 1 4 10) - { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 174 (1 1 5 9) - { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 175 (1 1 6 8) - { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 176 (1 1 7 7) - { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 177 (1 1 8 6) - { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 178 (1 1 9 5) - { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 179 (1 1 10 4) - { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 180 (1 1 11 3) - { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 181 (1 1 12 2) - { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 182 (1 1 13 1) - { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 183 (1 1 14 0) - { 1.888889f, 13.222222f, 0.444444f, 0.040359f }, // 184 (1 2 0 13) - { 2.000000f, 12.666667f, 0.666667f, 0.040179f }, // 185 (1 2 1 12) - { 2.111111f, 12.111111f, 0.888889f, 0.040359f }, // 186 (1 2 2 11) - { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 187 (1 2 3 10) - { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 188 (1 2 4 9) - { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 189 (1 2 5 8) - { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 190 (1 2 6 7) - { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 191 (1 2 7 6) - { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 192 (1 2 8 5) - { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 193 (1 2 9 4) - { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 194 (1 2 10 3) - { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 195 (1 2 11 2) - { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 196 (1 2 12 1) - { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 197 (1 2 13 0) - { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 198 (1 3 0 12) - { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 199 (1 3 1 11) - { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 200 (1 3 2 10) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 201 (1 3 3 9) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 202 (1 3 4 8) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 203 (1 3 5 7) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 204 (1 3 6 6) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 205 (1 3 7 5) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 206 (1 3 8 4) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 207 (1 3 9 3) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 208 (1 3 10 2) - { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 209 (1 3 11 1) - { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 210 (1 3 12 0) - { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 211 (1 4 0 11) - { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 212 (1 4 1 10) - { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 213 (1 4 2 9) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 214 (1 4 3 8) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 215 (1 4 4 7) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 216 (1 4 5 6) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 217 (1 4 6 5) - { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 218 (1 4 7 4) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 219 (1 4 8 3) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 220 (1 4 9 2) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 221 (1 4 10 1) - { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 222 (1 4 11 0) - { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 223 (1 5 0 10) - { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 224 (1 5 1 9) - { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 225 (1 5 2 8) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 226 (1 5 3 7) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 227 (1 5 4 6) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 228 (1 5 5 5) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 229 (1 5 6 4) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 230 (1 5 7 3) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 231 (1 5 8 2) - { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 232 (1 5 9 1) - { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 233 (1 5 10 0) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 234 (1 6 0 9) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 235 (1 6 1 8) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 236 (1 6 2 7) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 237 (1 6 3 6) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 238 (1 6 4 5) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 239 (1 6 5 4) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 240 (1 6 6 3) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 241 (1 6 7 2) - { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 242 (1 6 8 1) - { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 243 (1 6 9 0) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 244 (1 7 0 8) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 245 (1 7 1 7) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 246 (1 7 2 6) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 247 (1 7 3 5) - { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 248 (1 7 4 4) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 249 (1 7 5 3) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 250 (1 7 6 2) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 251 (1 7 7 1) - { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 252 (1 7 8 0) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 253 (1 8 0 7) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 254 (1 8 1 6) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 255 (1 8 2 5) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 256 (1 8 3 4) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 257 (1 8 4 3) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 258 (1 8 5 2) - { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 259 (1 8 6 1) - { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 260 (1 8 7 0) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 261 (1 9 0 6) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 262 (1 9 1 5) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 263 (1 9 2 4) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 264 (1 9 3 3) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 265 (1 9 4 2) - { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 266 (1 9 5 1) - { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 267 (1 9 6 0) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 268 (1 10 0 5) - { 5.555556f, 5.555555f, 2.444444f, 0.040179f }, // 269 (1 10 1 4) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 270 (1 10 2 3) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 271 (1 10 3 2) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 272 (1 10 4 1) - { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 273 (1 10 5 0) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 274 (1 11 0 4) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 275 (1 11 1 3) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 276 (1 11 2 2) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 277 (1 11 3 1) - { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 278 (1 11 4 0) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 279 (1 12 0 3) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 280 (1 12 1 2) - { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 281 (1 12 2 1) - { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 282 (1 12 3 0) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 283 (1 13 0 2) - { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 284 (1 13 1 1) - { 7.000000f, 2.333333f, 3.333333f, 0.191489f }, // 285 (1 13 2 0) - { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 286 (1 14 0 1) - { 7.333333f, 2.000000f, 3.333333f, 0.281250f }, // 287 (1 14 1 0) - { 7.666667f, 1.666667f, 3.333333f, 0.600000f }, // 288 (1 15 0 0) - { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 289 (2 0 0 14) - { 2.111111f, 13.444445f, 0.222222f, 0.035294f }, // 290 (2 0 1 13) - { 2.222222f, 12.888889f, 0.444444f, 0.035156f }, // 291 (2 0 2 12) - { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 292 (2 0 3 11) - { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 293 (2 0 4 10) - { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 294 (2 0 5 9) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 295 (2 0 6 8) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 296 (2 0 7 7) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 297 (2 0 8 6) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 298 (2 0 9 5) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 299 (2 0 10 4) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 300 (2 0 11 3) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 301 (2 0 12 2) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 302 (2 0 13 1) - { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 303 (2 0 14 0) - { 2.444444f, 13.111111f, 0.222222f, 0.031250f }, // 304 (2 1 0 13) - { 2.555556f, 12.555555f, 0.444444f, 0.031359f }, // 305 (2 1 1 12) - { 2.666667f, 12.000000f, 0.666667f, 0.031690f }, // 306 (2 1 2 11) - { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 307 (2 1 3 10) - { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 308 (2 1 4 9) - { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 309 (2 1 5 8) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 310 (2 1 6 7) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 311 (2 1 7 6) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 312 (2 1 8 5) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 313 (2 1 9 4) - { 3.555556f, 7.555556f, 2.444444f, 0.047872f }, // 314 (2 1 10 3) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 315 (2 1 11 2) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 316 (2 1 12 1) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 317 (2 1 13 0) - { 2.888889f, 12.222222f, 0.444444f, 0.028481f }, // 318 (2 2 0 12) - { 3.000000f, 11.666667f, 0.666667f, 0.028939f }, // 319 (2 2 1 11) - { 3.111111f, 11.111111f, 0.888889f, 0.029605f }, // 320 (2 2 2 10) - { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 321 (2 2 3 9) - { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 322 (2 2 4 8) - { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 323 (2 2 5 7) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 324 (2 2 6 6) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 325 (2 2 7 5) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 326 (2 2 8 4) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 327 (2 2 9 3) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 328 (2 2 10 2) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 329 (2 2 11 1) - { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 330 (2 2 12 0) - { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 331 (2 3 0 11) - { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 332 (2 3 1 10) - { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 333 (2 3 2 9) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 334 (2 3 3 8) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 335 (2 3 4 7) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 336 (2 3 5 6) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 337 (2 3 6 5) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 338 (2 3 7 4) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 339 (2 3 8 3) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 340 (2 3 9 2) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 341 (2 3 10 1) - { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 342 (2 3 11 0) - { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 343 (2 4 0 10) - { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 344 (2 4 1 9) - { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 345 (2 4 2 8) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 346 (2 4 3 7) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 347 (2 4 4 6) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 348 (2 4 5 5) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 349 (2 4 6 4) - { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 350 (2 4 7 3) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 351 (2 4 8 2) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 352 (2 4 9 1) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 353 (2 4 10 0) - { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 354 (2 5 0 9) - { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 355 (2 5 1 8) - { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 356 (2 5 2 7) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 357 (2 5 3 6) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 358 (2 5 4 5) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 359 (2 5 5 4) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 360 (2 5 6 3) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 361 (2 5 7 2) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 362 (2 5 8 1) - { 5.222222f, 4.555556f, 3.111111f, 0.070866f }, // 363 (2 5 9 0) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 364 (2 6 0 8) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 365 (2 6 1 7) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 366 (2 6 2 6) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 367 (2 6 3 5) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 368 (2 6 4 4) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 369 (2 6 5 3) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 370 (2 6 6 2) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 371 (2 6 7 1) - { 5.555555f, 4.222222f, 3.111111f, 0.072581f }, // 372 (2 6 8 0) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 373 (2 7 0 7) - { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 374 (2 7 1 6) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 375 (2 7 2 5) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 376 (2 7 3 4) - { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 377 (2 7 4 3) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 378 (2 7 5 2) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 379 (2 7 6 1) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 380 (2 7 7 0) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 381 (2 8 0 6) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 382 (2 8 1 5) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 383 (2 8 2 4) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 384 (2 8 3 3) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 385 (2 8 4 2) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 386 (2 8 5 1) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 387 (2 8 6 0) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 388 (2 9 0 5) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 389 (2 9 1 4) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 390 (2 9 2 3) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 391 (2 9 3 2) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 392 (2 9 4 1) - { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 393 (2 9 5 0) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 394 (2 10 0 4) - { 6.555556f, 4.555555f, 2.444444f, 0.041860f }, // 395 (2 10 1 3) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 396 (2 10 2 2) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 397 (2 10 3 1) - { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 398 (2 10 4 0) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 399 (2 11 0 3) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 400 (2 11 1 2) - { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 401 (2 11 2 1) - { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 402 (2 11 3 0) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 403 (2 12 0 2) - { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 404 (2 12 1 1) - { 7.555556f, 2.222222f, 3.111111f, 0.140625f }, // 405 (2 12 2 0) - { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 406 (2 13 0 1) - { 7.888889f, 1.888889f, 3.111111f, 0.191489f }, // 407 (2 13 1 0) - { 8.222222f, 1.555556f, 3.111111f, 0.321429f }, // 408 (2 14 0 0) - { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 409 (3 0 0 13) - { 3.111111f, 12.444445f, 0.222222f, 0.025862f }, // 410 (3 0 1 12) - { 3.222222f, 11.888889f, 0.444444f, 0.026239f }, // 411 (3 0 2 11) - { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 412 (3 0 3 10) - { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 413 (3 0 4 9) - { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 414 (3 0 5 8) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 415 (3 0 6 7) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 416 (3 0 7 6) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 417 (3 0 8 5) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 418 (3 0 9 4) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 419 (3 0 10 3) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 420 (3 0 11 2) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 421 (3 0 12 1) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 422 (3 0 13 0) - { 3.444444f, 12.111111f, 0.222222f, 0.024000f }, // 423 (3 1 0 12) - { 3.555556f, 11.555555f, 0.444444f, 0.024457f }, // 424 (3 1 1 11) - { 3.666667f, 11.000000f, 0.666667f, 0.025070f }, // 425 (3 1 2 10) - { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 426 (3 1 3 9) - { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 427 (3 1 4 8) - { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 428 (3 1 5 7) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 429 (3 1 6 6) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 430 (3 1 7 5) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 431 (3 1 8 4) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 432 (3 1 9 3) - { 4.555555f, 6.555556f, 2.444444f, 0.041860f }, // 433 (3 1 10 2) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 434 (3 1 11 1) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 435 (3 1 12 0) - { 3.888889f, 11.222222f, 0.444444f, 0.023018f }, // 436 (3 2 0 11) - { 4.000000f, 10.666667f, 0.666667f, 0.023684f }, // 437 (3 2 1 10) - { 4.111111f, 10.111111f, 0.888889f, 0.024523f }, // 438 (3 2 2 9) - { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 439 (3 2 3 8) - { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 440 (3 2 4 7) - { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 441 (3 2 5 6) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 442 (3 2 6 5) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 443 (3 2 7 4) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 444 (3 2 8 3) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 445 (3 2 9 2) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 446 (3 2 10 1) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 447 (3 2 11 0) - { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 448 (3 3 0 10) - { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 449 (3 3 1 9) - { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 450 (3 3 2 8) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 451 (3 3 3 7) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 452 (3 3 4 6) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 453 (3 3 5 5) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 454 (3 3 6 4) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 455 (3 3 7 3) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 456 (3 3 8 2) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 457 (3 3 9 1) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 458 (3 3 10 0) - { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 459 (3 4 0 9) - { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 460 (3 4 1 8) - { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 461 (3 4 2 7) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 462 (3 4 3 6) - { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 463 (3 4 4 5) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 464 (3 4 5 4) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 465 (3 4 6 3) - { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 466 (3 4 7 2) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 467 (3 4 8 1) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 468 (3 4 9 0) - { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 469 (3 5 0 8) - { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 470 (3 5 1 7) - { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 471 (3 5 2 6) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 472 (3 5 3 5) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 473 (3 5 4 4) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 474 (3 5 5 3) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 475 (3 5 6 2) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 476 (3 5 7 1) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 477 (3 5 8 0) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 478 (3 6 0 7) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 479 (3 6 1 6) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 480 (3 6 2 5) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 481 (3 6 3 4) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 482 (3 6 4 3) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 483 (3 6 5 2) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 484 (3 6 6 1) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 485 (3 6 7 0) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 486 (3 7 0 6) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 487 (3 7 1 5) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 488 (3 7 2 4) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 489 (3 7 3 3) - { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 490 (3 7 4 2) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 491 (3 7 5 1) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 492 (3 7 6 0) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 493 (3 8 0 5) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 494 (3 8 1 4) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 495 (3 8 2 3) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 496 (3 8 3 2) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 497 (3 8 4 1) - { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 498 (3 8 5 0) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 499 (3 9 0 4) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 500 (3 9 1 3) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 501 (3 9 2 2) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 502 (3 9 3 1) - { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 503 (3 9 4 0) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 504 (3 10 0 3) - { 7.555556f, 3.555556f, 2.444444f, 0.047872f }, // 505 (3 10 1 2) - { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 506 (3 10 2 1) - { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 507 (3 10 3 0) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 508 (3 11 0 2) - { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 509 (3 11 1 1) - { 8.111111f, 2.111111f, 2.888889f, 0.113924f }, // 510 (3 11 2 0) - { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 511 (3 12 0 1) - { 8.444445f, 1.777778f, 2.888889f, 0.150000f }, // 512 (3 12 1 0) - { 8.777778f, 1.444444f, 2.888889f, 0.230769f }, // 513 (3 13 0 0) - { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 514 (4 0 0 12) - { 4.111111f, 11.444445f, 0.222222f, 0.021277f }, // 515 (4 0 1 11) - { 4.222222f, 10.888889f, 0.444444f, 0.021845f }, // 516 (4 0 2 10) - { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 517 (4 0 3 9) - { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 518 (4 0 4 8) - { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 519 (4 0 5 7) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 520 (4 0 6 6) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 521 (4 0 7 5) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 522 (4 0 8 4) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 523 (4 0 9 3) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 524 (4 0 10 2) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 525 (4 0 11 1) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 526 (4 0 12 0) - { 4.444445f, 11.111111f, 0.222222f, 0.020270f }, // 527 (4 1 0 11) - { 4.555555f, 10.555555f, 0.444444f, 0.020882f }, // 528 (4 1 1 10) - { 4.666667f, 10.000000f, 0.666667f, 0.021635f }, // 529 (4 1 2 9) - { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 530 (4 1 3 8) - { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 531 (4 1 4 7) - { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 532 (4 1 5 6) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 533 (4 1 6 5) - { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 534 (4 1 7 4) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 535 (4 1 8 3) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 536 (4 1 9 2) - { 5.555555f, 5.555556f, 2.444444f, 0.040179f }, // 537 (4 1 10 1) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 538 (4 1 11 0) - { 4.888889f, 10.222222f, 0.444444f, 0.020089f }, // 539 (4 2 0 10) - { 5.000000f, 9.666667f, 0.666667f, 0.020882f }, // 540 (4 2 1 9) - { 5.111111f, 9.111111f, 0.888889f, 0.021845f }, // 541 (4 2 2 8) - { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 542 (4 2 3 7) - { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 543 (4 2 4 6) - { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 544 (4 2 5 5) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 545 (4 2 6 4) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 546 (4 2 7 3) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 547 (4 2 8 2) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 548 (4 2 9 1) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 549 (4 2 10 0) - { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 550 (4 3 0 9) - { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 551 (4 3 1 8) - { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 552 (4 3 2 7) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 553 (4 3 3 6) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 554 (4 3 4 5) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 555 (4 3 5 4) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 556 (4 3 6 3) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 557 (4 3 7 2) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 558 (4 3 8 1) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 559 (4 3 9 0) - { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 560 (4 4 0 8) - { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 561 (4 4 1 7) - { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 562 (4 4 2 6) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 563 (4 4 3 5) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 564 (4 4 4 4) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 565 (4 4 5 3) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 566 (4 4 6 2) - { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 567 (4 4 7 1) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 568 (4 4 8 0) - { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 569 (4 5 0 7) - { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 570 (4 5 1 6) - { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 571 (4 5 2 5) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 572 (4 5 3 4) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 573 (4 5 4 3) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 574 (4 5 5 2) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 575 (4 5 6 1) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 576 (4 5 7 0) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 577 (4 6 0 6) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 578 (4 6 1 5) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 579 (4 6 2 4) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 580 (4 6 3 3) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 581 (4 6 4 2) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 582 (4 6 5 1) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 583 (4 6 6 0) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 584 (4 7 0 5) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 585 (4 7 1 4) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 586 (4 7 2 3) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 587 (4 7 3 2) - { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 588 (4 7 4 1) - { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 589 (4 7 5 0) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 590 (4 8 0 4) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 591 (4 8 1 3) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 592 (4 8 2 2) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 593 (4 8 3 1) - { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 594 (4 8 4 0) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 595 (4 9 0 3) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 596 (4 9 1 2) - { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 597 (4 9 2 1) - { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 598 (4 9 3 0) - { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 599 (4 10 0 2) - { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 600 (4 10 1 1) - { 8.666667f, 2.000000f, 2.666667f, 0.097826f }, // 601 (4 10 2 0) - { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 602 (4 11 0 1) - { 9.000000f, 1.666667f, 2.666667f, 0.126761f }, // 603 (4 11 1 0) - { 9.333333f, 1.333333f, 2.666667f, 0.187500f }, // 604 (4 12 0 0) - { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 605 (5 0 0 11) - { 5.111111f, 10.444445f, 0.222222f, 0.018750f }, // 606 (5 0 1 10) - { 5.222222f, 9.888889f, 0.444444f, 0.019438f }, // 607 (5 0 2 9) - { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 608 (5 0 3 8) - { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 609 (5 0 4 7) - { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 610 (5 0 5 6) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 611 (5 0 6 5) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 612 (5 0 7 4) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 613 (5 0 8 3) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 614 (5 0 9 2) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 615 (5 0 10 1) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 616 (5 0 11 0) - { 5.444445f, 10.111111f, 0.222222f, 0.018182f }, // 617 (5 1 0 10) - { 5.555555f, 9.555555f, 0.444444f, 0.018908f }, // 618 (5 1 1 9) - { 5.666667f, 9.000000f, 0.666667f, 0.019780f }, // 619 (5 1 2 8) - { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 620 (5 1 3 7) - { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 621 (5 1 4 6) - { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 622 (5 1 5 5) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 623 (5 1 6 4) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 624 (5 1 7 3) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 625 (5 1 8 2) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 626 (5 1 9 1) - { 6.555555f, 4.555556f, 2.444444f, 0.041860f }, // 627 (5 1 10 0) - { 5.888889f, 9.222222f, 0.444444f, 0.018480f }, // 628 (5 2 0 9) - { 6.000000f, 8.666667f, 0.666667f, 0.019397f }, // 629 (5 2 1 8) - { 6.111111f, 8.111111f, 0.888889f, 0.020501f }, // 630 (5 2 2 7) - { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 631 (5 2 3 6) - { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 632 (5 2 4 5) - { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 633 (5 2 5 4) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 634 (5 2 6 3) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 635 (5 2 7 2) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 636 (5 2 8 1) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 637 (5 2 9 0) - { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 638 (5 3 0 8) - { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 639 (5 3 1 7) - { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 640 (5 3 2 6) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 641 (5 3 3 5) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 642 (5 3 4 4) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 643 (5 3 5 3) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 644 (5 3 6 2) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 645 (5 3 7 1) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 646 (5 3 8 0) - { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 647 (5 4 0 7) - { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 648 (5 4 1 6) - { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 649 (5 4 2 5) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 650 (5 4 3 4) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 651 (5 4 4 3) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 652 (5 4 5 2) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 653 (5 4 6 1) - { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 654 (5 4 7 0) - { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 655 (5 5 0 6) - { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 656 (5 5 1 5) - { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 657 (5 5 2 4) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 658 (5 5 3 3) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 659 (5 5 4 2) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 660 (5 5 5 1) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 661 (5 5 6 0) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 662 (5 6 0 5) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 663 (5 6 1 4) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 664 (5 6 2 3) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 665 (5 6 3 2) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 666 (5 6 4 1) - { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 667 (5 6 5 0) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 668 (5 7 0 4) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 669 (5 7 1 3) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 670 (5 7 2 2) - { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 671 (5 7 3 1) - { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 672 (5 7 4 0) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 673 (5 8 0 3) - { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 674 (5 8 1 2) - { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 675 (5 8 2 1) - { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 676 (5 8 3 0) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 677 (5 9 0 2) - { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 678 (5 9 1 1) - { 9.222222f, 1.888889f, 2.444444f, 0.087379f }, // 679 (5 9 2 0) - { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 680 (5 10 0 1) - { 9.555555f, 1.555556f, 2.444444f, 0.112500f }, // 681 (5 10 1 0) - { 9.888889f, 1.222222f, 2.444444f, 0.163636f }, // 682 (5 11 0 0) - { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 683 (6 0 0 10) - { 6.111111f, 9.444445f, 0.222222f, 0.017341f }, // 684 (6 0 1 9) - { 6.222222f, 8.888889f, 0.444444f, 0.018145f }, // 685 (6 0 2 8) - { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 686 (6 0 3 7) - { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 687 (6 0 4 6) - { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 688 (6 0 5 5) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 689 (6 0 6 4) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 690 (6 0 7 3) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 691 (6 0 8 2) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 692 (6 0 9 1) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 693 (6 0 10 0) - { 6.444445f, 9.111111f, 0.222222f, 0.017045f }, // 694 (6 1 0 9) - { 6.555555f, 8.555555f, 0.444444f, 0.017893f }, // 695 (6 1 1 8) - { 6.666667f, 8.000000f, 0.666667f, 0.018908f }, // 696 (6 1 2 7) - { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 697 (6 1 3 6) - { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 698 (6 1 4 5) - { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 699 (6 1 5 4) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 700 (6 1 6 3) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 701 (6 1 7 2) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 702 (6 1 8 1) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 703 (6 1 9 0) - { 6.888889f, 8.222222f, 0.444444f, 0.017717f }, // 704 (6 2 0 8) - { 7.000000f, 7.666667f, 0.666667f, 0.018789f }, // 705 (6 2 1 7) - { 7.111111f, 7.111111f, 0.888889f, 0.020089f }, // 706 (6 2 2 6) - { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 707 (6 2 3 5) - { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 708 (6 2 4 4) - { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 709 (6 2 5 3) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 710 (6 2 6 2) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 711 (6 2 7 1) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 712 (6 2 8 0) - { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 713 (6 3 0 7) - { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 714 (6 3 1 6) - { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 715 (6 3 2 5) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 716 (6 3 3 4) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 717 (6 3 4 3) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 718 (6 3 5 2) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 719 (6 3 6 1) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 720 (6 3 7 0) - { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 721 (6 4 0 6) - { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 722 (6 4 1 5) - { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 723 (6 4 2 4) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 724 (6 4 3 3) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 725 (6 4 4 2) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 726 (6 4 5 1) - { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 727 (6 4 6 0) - { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 728 (6 5 0 5) - { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 729 (6 5 1 4) - { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 730 (6 5 2 3) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 731 (6 5 3 2) - { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 732 (6 5 4 1) - { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 733 (6 5 5 0) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 734 (6 6 0 4) - { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 735 (6 6 1 3) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 736 (6 6 2 2) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 737 (6 6 3 1) - { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 738 (6 6 4 0) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 739 (6 7 0 3) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 740 (6 7 1 2) - { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 741 (6 7 2 1) - { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 742 (6 7 3 0) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 743 (6 8 0 2) - { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 744 (6 8 1 1) - { 9.777778f, 1.777778f, 2.222222f, 0.080357f }, // 745 (6 8 2 0) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 746 (6 9 0 1) - { 10.111111f, 1.444444f, 2.222222f, 0.103448f }, // 747 (6 9 1 0) - { 10.444445f, 1.111111f, 2.222222f, 0.150000f }, // 748 (6 10 0 0) - { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 749 (7 0 0 9) - { 7.111111f, 8.444445f, 0.222222f, 0.016667f }, // 750 (7 0 1 8) - { 7.222222f, 7.888889f, 0.444444f, 0.017613f }, // 751 (7 0 2 7) - { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 752 (7 0 3 6) - { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 753 (7 0 4 5) - { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 754 (7 0 5 4) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 755 (7 0 6 3) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 756 (7 0 7 2) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 757 (7 0 8 1) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 758 (7 0 9 0) - { 7.444445f, 8.111111f, 0.222222f, 0.016575f }, // 759 (7 1 0 8) - { 7.555555f, 7.555555f, 0.444444f, 0.017578f }, // 760 (7 1 1 7) - { 7.666667f, 7.000000f, 0.666667f, 0.018789f }, // 761 (7 1 2 6) - { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 762 (7 1 3 5) - { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 763 (7 1 4 4) - { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 764 (7 1 5 3) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 765 (7 1 6 2) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 766 (7 1 7 1) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 767 (7 1 8 0) - { 7.888889f, 7.222222f, 0.444444f, 0.017613f }, // 768 (7 2 0 7) - { 8.000000f, 6.666667f, 0.666667f, 0.018908f }, // 769 (7 2 1 6) - { 8.111111f, 6.111111f, 0.888889f, 0.020501f }, // 770 (7 2 2 5) - { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 771 (7 2 3 4) - { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 772 (7 2 4 3) - { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 773 (7 2 5 2) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 774 (7 2 6 1) - { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 775 (7 2 7 0) - { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 776 (7 3 0 6) - { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 777 (7 3 1 5) - { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 778 (7 3 2 4) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 779 (7 3 3 3) - { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 780 (7 3 4 2) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 781 (7 3 5 1) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 782 (7 3 6 0) - { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 783 (7 4 0 5) - { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 784 (7 4 1 4) - { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 785 (7 4 2 3) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 786 (7 4 3 2) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 787 (7 4 4 1) - { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 788 (7 4 5 0) - { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 789 (7 5 0 4) - { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 790 (7 5 1 3) - { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 791 (7 5 2 2) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 792 (7 5 3 1) - { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 793 (7 5 4 0) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 794 (7 6 0 3) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 795 (7 6 1 2) - { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 796 (7 6 2 1) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 797 (7 6 3 0) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 798 (7 7 0 2) - { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 799 (7 7 1 1) - { 10.333333f, 1.666667f, 2.000000f, 0.075630f }, // 800 (7 7 2 0) - { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 801 (7 8 0 1) - { 10.666667f, 1.333333f, 2.000000f, 0.097826f }, // 802 (7 8 1 0) - { 11.000000f, 1.000000f, 2.000000f, 0.142857f }, // 803 (7 9 0 0) - { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 804 (8 0 0 8) - { 8.111111f, 7.444445f, 0.222222f, 0.016575f }, // 805 (8 0 1 7) - { 8.222222f, 6.888889f, 0.444444f, 0.017717f }, // 806 (8 0 2 6) - { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 807 (8 0 3 5) - { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 808 (8 0 4 4) - { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 809 (8 0 5 3) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 810 (8 0 6 2) - { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 811 (8 0 7 1) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 812 (8 0 8 0) - { 8.444445f, 7.111111f, 0.222222f, 0.016667f }, // 813 (8 1 0 7) - { 8.555555f, 6.555555f, 0.444444f, 0.017893f }, // 814 (8 1 1 6) - { 8.666667f, 6.000000f, 0.666667f, 0.019397f }, // 815 (8 1 2 5) - { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 816 (8 1 3 4) - { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 817 (8 1 4 3) - { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 818 (8 1 5 2) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 819 (8 1 6 1) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 820 (8 1 7 0) - { 8.888889f, 6.222222f, 0.444444f, 0.018145f }, // 821 (8 2 0 6) - { 9.000000f, 5.666667f, 0.666667f, 0.019780f }, // 822 (8 2 1 5) - { 9.111111f, 5.111111f, 0.888889f, 0.021845f }, // 823 (8 2 2 4) - { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 824 (8 2 3 3) - { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 825 (8 2 4 2) - { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 826 (8 2 5 1) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 827 (8 2 6 0) - { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 828 (8 3 0 5) - { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 829 (8 3 1 4) - { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 830 (8 3 2 3) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 831 (8 3 3 2) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 832 (8 3 4 1) - { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 833 (8 3 5 0) - { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 834 (8 4 0 4) - { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 835 (8 4 1 3) - { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 836 (8 4 2 2) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 837 (8 4 3 1) - { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 838 (8 4 4 0) - { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 839 (8 5 0 3) - { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 840 (8 5 1 2) - { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 841 (8 5 2 1) - { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 842 (8 5 3 0) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 843 (8 6 0 2) - { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 844 (8 6 1 1) - { 10.888889f, 1.555556f, 1.777778f, 0.072581f }, // 845 (8 6 2 0) - { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 846 (8 7 0 1) - { 11.222222f, 1.222222f, 1.777778f, 0.094737f }, // 847 (8 7 1 0) - { 11.555555f, 0.888889f, 1.777778f, 0.140625f }, // 848 (8 8 0 0) - { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 849 (9 0 0 7) - { 9.111111f, 6.444445f, 0.222222f, 0.017045f }, // 850 (9 0 1 6) - { 9.222222f, 5.888889f, 0.444444f, 0.018480f }, // 851 (9 0 2 5) - { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 852 (9 0 3 4) - { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 853 (9 0 4 3) - { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 854 (9 0 5 2) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 855 (9 0 6 1) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 856 (9 0 7 0) - { 9.444445f, 6.111111f, 0.222222f, 0.017341f }, // 857 (9 1 0 6) - { 9.555555f, 5.555555f, 0.444444f, 0.018908f }, // 858 (9 1 1 5) - { 9.666667f, 5.000000f, 0.666667f, 0.020882f }, // 859 (9 1 2 4) - { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 860 (9 1 3 3) - { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 861 (9 1 4 2) - { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 862 (9 1 5 1) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 863 (9 1 6 0) - { 9.888889f, 5.222222f, 0.444444f, 0.019438f }, // 864 (9 2 0 5) - { 10.000000f, 4.666667f, 0.666667f, 0.021635f }, // 865 (9 2 1 4) - { 10.111111f, 4.111111f, 0.888889f, 0.024523f }, // 866 (9 2 2 3) - { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 867 (9 2 3 2) - { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 868 (9 2 4 1) - { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 869 (9 2 5 0) - { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 870 (9 3 0 4) - { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 871 (9 3 1 3) - { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 872 (9 3 2 2) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 873 (9 3 3 1) - { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 874 (9 3 4 0) - { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 875 (9 4 0 3) - { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 876 (9 4 1 2) - { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 877 (9 4 2 1) - { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 878 (9 4 3 0) - { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 879 (9 5 0 2) - { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 880 (9 5 1 1) - { 11.444445f, 1.444444f, 1.555556f, 0.070866f }, // 881 (9 5 2 0) - { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 882 (9 6 0 1) - { 11.777778f, 1.111111f, 1.555556f, 0.093750f }, // 883 (9 6 1 0) - { 12.111111f, 0.777778f, 1.555556f, 0.142857f }, // 884 (9 7 0 0) - { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 885 (10 0 0 6) - { 10.111111f, 5.444445f, 0.222222f, 0.018182f }, // 886 (10 0 1 5) - { 10.222222f, 4.888889f, 0.444444f, 0.020089f }, // 887 (10 0 2 4) - { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 888 (10 0 3 3) - { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 889 (10 0 4 2) - { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 890 (10 0 5 1) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 891 (10 0 6 0) - { 10.444445f, 5.111111f, 0.222222f, 0.018750f }, // 892 (10 1 0 5) - { 10.555555f, 4.555555f, 0.444444f, 0.020882f }, // 893 (10 1 1 4) - { 10.666667f, 4.000000f, 0.666667f, 0.023684f }, // 894 (10 1 2 3) - { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 895 (10 1 3 2) - { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 896 (10 1 4 1) - { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 897 (10 1 5 0) - { 10.888889f, 4.222222f, 0.444444f, 0.021845f }, // 898 (10 2 0 4) - { 11.000000f, 3.666667f, 0.666667f, 0.025070f }, // 899 (10 2 1 3) - { 11.111111f, 3.111111f, 0.888889f, 0.029605f }, // 900 (10 2 2 2) - { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 901 (10 2 3 1) - { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 902 (10 2 4 0) - { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 903 (10 3 0 3) - { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 904 (10 3 1 2) - { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 905 (10 3 2 1) - { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 906 (10 3 3 0) - { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 907 (10 4 0 2) - { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 908 (10 4 1 1) - { 12.000000f, 1.333333f, 1.333333f, 0.070313f }, // 909 (10 4 2 0) - { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 910 (10 5 0 1) - { 12.333333f, 1.000000f, 1.333333f, 0.094737f }, // 911 (10 5 1 0) - { 12.666667f, 0.666667f, 1.333333f, 0.150000f }, // 912 (10 6 0 0) - { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 913 (11 0 0 5) - { 11.111111f, 4.444445f, 0.222222f, 0.020270f }, // 914 (11 0 1 4) - { 11.222222f, 3.888889f, 0.444444f, 0.023018f }, // 915 (11 0 2 3) - { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 916 (11 0 3 2) - { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 917 (11 0 4 1) - { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 918 (11 0 5 0) - { 11.444445f, 4.111111f, 0.222222f, 0.021277f }, // 919 (11 1 0 4) - { 11.555555f, 3.555556f, 0.444444f, 0.024457f }, // 920 (11 1 1 3) - { 11.666667f, 3.000000f, 0.666667f, 0.028939f }, // 921 (11 1 2 2) - { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 922 (11 1 3 1) - { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 923 (11 1 4 0) - { 11.888889f, 3.222222f, 0.444444f, 0.026239f }, // 924 (11 2 0 3) - { 12.000000f, 2.666667f, 0.666667f, 0.031690f }, // 925 (11 2 1 2) - { 12.111111f, 2.111111f, 0.888889f, 0.040359f }, // 926 (11 2 2 1) - { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 927 (11 2 3 0) - { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 928 (11 3 0 2) - { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 929 (11 3 1 1) - { 12.555555f, 1.222222f, 1.111111f, 0.070866f }, // 930 (11 3 2 0) - { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 931 (11 4 0 1) - { 12.888889f, 0.888889f, 1.111111f, 0.097826f }, // 932 (11 4 1 0) - { 13.222222f, 0.555556f, 1.111111f, 0.163636f }, // 933 (11 5 0 0) - { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 934 (12 0 0 4) - { 12.111111f, 3.444444f, 0.222222f, 0.024000f }, // 935 (12 0 1 3) - { 12.222222f, 2.888889f, 0.444444f, 0.028481f }, // 936 (12 0 2 2) - { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 937 (12 0 3 1) - { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 938 (12 0 4 0) - { 12.444445f, 3.111111f, 0.222222f, 0.025862f }, // 939 (12 1 0 3) - { 12.555555f, 2.555556f, 0.444444f, 0.031359f }, // 940 (12 1 1 2) - { 12.666667f, 2.000000f, 0.666667f, 0.040179f }, // 941 (12 1 2 1) - { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 942 (12 1 3 0) - { 12.888889f, 2.222222f, 0.444444f, 0.035156f }, // 943 (12 2 0 2) - { 13.000000f, 1.666667f, 0.666667f, 0.047120f }, // 944 (12 2 1 1) - { 13.111111f, 1.111111f, 0.888889f, 0.072581f }, // 945 (12 2 2 0) - { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 946 (12 3 0 1) - { 13.444445f, 0.777778f, 0.888889f, 0.103448f }, // 947 (12 3 1 0) - { 13.777778f, 0.444444f, 0.888889f, 0.187500f }, // 948 (12 4 0 0) - { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 949 (13 0 0 3) - { 13.111111f, 2.444444f, 0.222222f, 0.031250f }, // 950 (13 0 1 2) - { 13.222222f, 1.888889f, 0.444444f, 0.040359f }, // 951 (13 0 2 1) - { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 952 (13 0 3 0) - { 13.444445f, 2.111111f, 0.222222f, 0.035294f }, // 953 (13 1 0 2) - { 13.555555f, 1.555556f, 0.444444f, 0.047872f }, // 954 (13 1 1 1) - { 13.666667f, 1.000000f, 0.666667f, 0.075630f }, // 955 (13 1 2 0) - { 13.888889f, 1.222222f, 0.444444f, 0.059603f }, // 956 (13 2 0 1) - { 14.000000f, 0.666667f, 0.666667f, 0.112500f }, // 957 (13 2 1 0) - { 14.333333f, 0.333333f, 0.666667f, 0.230769f }, // 958 (13 3 0 0) - { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 959 (14 0 0 2) - { 14.111111f, 1.444444f, 0.222222f, 0.049180f }, // 960 (14 0 1 1) - { 14.222222f, 0.888889f, 0.444444f, 0.080357f }, // 961 (14 0 2 0) - { 14.444445f, 1.111111f, 0.222222f, 0.062500f }, // 962 (14 1 0 1) - { 14.555555f, 0.555556f, 0.444444f, 0.126761f }, // 963 (14 1 1 0) - { 14.888889f, 0.222222f, 0.444444f, 0.321429f }, // 964 (14 2 0 0) - { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 965 (15 0 0 1) - { 15.111111f, 0.444444f, 0.222222f, 0.150000f }, // 966 (15 0 1 0) - { 15.444445f, 0.111111f, 0.222222f, 0.600000f }, // 967 (15 1 0 0) - { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 968 (16 0 0 0) -}; // 969 four cluster elements - -#if ICBC_USE_SIMD - -bool ClusterFit::compress3(Vector3 * start, Vector3 * end) -{ - const int count = m_count; - const SimdVector one = SimdVector(1.0f); - const SimdVector zero = SimdVector(0.0f); - const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); - const SimdVector two = SimdVector(2.0); - const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); - const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f); - - // declare variables - SimdVector beststart = SimdVector(0.0f); - SimdVector bestend = SimdVector(0.0f); - SimdVector besterror = SimdVector(FLT_MAX); - - SimdVector x0 = zero; - - // check all possible clusters for this total order - for (int c0 = 0; c0 <= count; c0++) - { - SimdVector x1 = zero; - - for (int c1 = 0; c1 <= count - c0; c1++) - { - const SimdVector x2 = m_xsum - x1 - x0; - - //Vector3 alphax_sum = x0 + x1 * 0.5f; - //float alpha2_sum = w0 + w1 * 0.25f; - const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum - const SimdVector alpha2_sum = alphax_sum.splatW(); - - //const Vector3 betax_sum = x2 + x1 * 0.5f; - //const float beta2_sum = w2 + w1 * 0.25f; - const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum - const SimdVector beta2_sum = betax_sum.splatW(); - - //const float alphabeta_sum = w1 * 0.25f; - const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum - - // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - const SimdVector factor = reciprocal(negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum)); - - SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; - SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - - // clamp to the grid - a = min(one, max(zero, a)); - b = min(one, max(zero, b)); - a = truncate(multiplyAdd(grid, a, half)) * gridrcp; - b = truncate(multiplyAdd(grid, b, half)) * gridrcp; - - // compute the error (we skip the constant xxsum) - SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum); - SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum); - SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2); - SimdVector e4 = multiplyAdd(two, e3, e1); - - // apply the metric to the error term - SimdVector e5 = e4 * m_metricSqr; - SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); - - // keep the solution if it wins - if (compareAnyLessThan(error, besterror)) - { - besterror = error; - beststart = a; - bestend = b; - } - - x1 += m_weighted[c0 + c1]; - } - - x0 += m_weighted[c0]; - } - - // save the block if necessary - if (compareAnyLessThan(besterror, m_besterror)) - { - *start = beststart.toVector3(); - *end = bestend.toVector3(); - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::compress4(Vector3 * start, Vector3 * end) -{ - const int count = m_count; - const SimdVector one = SimdVector(1.0f); - const SimdVector zero = SimdVector(0.0f); - const SimdVector half = SimdVector(0.5f); - const SimdVector two = SimdVector(2.0); - const SimdVector onethird(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f); - const SimdVector twothirds(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f); - const SimdVector twonineths = SimdVector(2.0f / 9.0f); - const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); - const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f); - - // declare variables - SimdVector beststart = SimdVector(0.0f); - SimdVector bestend = SimdVector(0.0f); - SimdVector besterror = SimdVector(FLT_MAX); - - SimdVector x0 = zero; - - // check all possible clusters for this total order - for (int c0 = 0; c0 <= count; c0++) - { - SimdVector x1 = zero; - - for (int c1 = 0; c1 <= count - c0; c1++) - { - SimdVector x2 = zero; - - for (int c2 = 0; c2 <= count - c0 - c1; c2++) - { - const SimdVector x3 = m_xsum - x2 - x1 - x0; - - //const Vector3 alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - //const float alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); - const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum - const SimdVector alpha2_sum = alphax_sum.splatW(); - - //const Vector3 betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); - //const float beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); - const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum - const SimdVector beta2_sum = betax_sum.splatW(); - - //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f); - const SimdVector alphabeta_sum = twonineths * (x1 + x2).splatW(); // alphabeta_sum - - //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - const SimdVector factor = reciprocal(negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum)); - - SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; - SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - - // clamp to the grid - a = min(one, max(zero, a)); - b = min(one, max(zero, b)); - a = truncate(multiplyAdd(grid, a, half)) * gridrcp; - b = truncate(multiplyAdd(grid, b, half)) * gridrcp; - - // compute the error (we skip the constant xxsum) - // error = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); - SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum); - SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum); - SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2); - SimdVector e4 = multiplyAdd(two, e3, e1); - - // apply the metric to the error term - SimdVector e5 = e4 * m_metricSqr; - SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); - - // keep the solution if it wins - if (compareAnyLessThan(error, besterror)) - { - besterror = error; - beststart = a; - bestend = b; - } - - x2 += m_weighted[c0 + c1 + c2]; - } - - x1 += m_weighted[c0 + c1]; - } - - x0 += m_weighted[c0]; - } - - // save the block if necessary - if (compareAnyLessThan(besterror, m_besterror)) - { - *start = beststart.toVector3(); - *end = bestend.toVector3(); - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end) -{ - const int count = m_count; - const SimdVector one = SimdVector(1.0f); - const SimdVector zero = SimdVector(0.0f); - const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); - const SimdVector two = SimdVector(2.0); - const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); - const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f); - - // declare variables - SimdVector beststart = SimdVector(0.0f); - SimdVector bestend = SimdVector(0.0f); - SimdVector besterror = SimdVector(FLT_MAX); - - SimdVector x0 = zero; - - // check all possible clusters for this total order - for (int c0 = 0, i = 0; c0 <= count; c0++) - { - SimdVector x1 = zero; - - for (int c1 = 0; c1 <= count - c0; c1++, i++) - { - const SimdVector constants = SimdVector((const float *)&s_threeElement[i]); - - const SimdVector alpha2_sum = constants.splatX(); - const SimdVector beta2_sum = constants.splatY(); - const SimdVector alphabeta_sum = constants.splatZ(); - const SimdVector factor = constants.splatW(); - - const SimdVector alphax_sum = multiplyAdd(x1, half, x0); - const SimdVector betax_sum = m_xsum - alphax_sum; - - SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; - SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - - // clamp to the grid - a = min(one, max(zero, a)); - b = min(one, max(zero, b)); - a = truncate(multiplyAdd(grid, a, half)) * gridrcp; - b = truncate(multiplyAdd(grid, b, half)) * gridrcp; - - // compute the error (we skip the constant xxsum) - SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum); - SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum); - SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2); - SimdVector e4 = multiplyAdd(two, e3, e1); - - // apply the metric to the error term - SimdVector e5 = e4 * m_metricSqr; - SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); - - // keep the solution if it wins - if (compareAnyLessThan(error, besterror)) - { - besterror = error; - beststart = a; - bestend = b; - } - - x1 += m_weighted[c0 + c1]; - } - - x0 += m_weighted[c0]; - } - - // save the block if necessary - if (compareAnyLessThan(besterror, m_besterror)) - { - *start = beststart.toVector3(); - *end = bestend.toVector3(); - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end) -{ - const SimdVector one = SimdVector(1.0f); - const SimdVector zero = SimdVector(0.0f); - const SimdVector half = SimdVector(0.5f); - const SimdVector two = SimdVector(2.0); - const SimdVector onethird(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f); - const SimdVector twothirds(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f); - const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); - const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f); - - // declare variables - SimdVector beststart = SimdVector(0.0f); - SimdVector bestend = SimdVector(0.0f); - SimdVector besterror = SimdVector(FLT_MAX); - - SimdVector x0 = zero; - - // check all possible clusters for this total order - for (int c0 = 0, i = 0; c0 <= 16; c0++) - { - SimdVector x1 = zero; - - for (int c1 = 0; c1 <= 16 - c0; c1++) - { - SimdVector x2 = zero; - - for (int c2 = 0; c2 <= 16 - c0 - c1; c2++, i++) - { - const SimdVector constants = SimdVector((const float *)&s_fourElement[i]); - - const SimdVector alpha2_sum = constants.splatX(); - const SimdVector beta2_sum = constants.splatY(); - const SimdVector alphabeta_sum = constants.splatZ(); - const SimdVector factor = constants.splatW(); - - const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); - const SimdVector betax_sum = m_xsum - alphax_sum; - - SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; - SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - - // clamp to the grid - a = min(one, max(zero, a)); - b = min(one, max(zero, b)); - a = truncate(multiplyAdd(grid, a, half)) * gridrcp; - b = truncate(multiplyAdd(grid, b, half)) * gridrcp; - - // compute the error (we skip the constant xxsum) - // error = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); - SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum); - SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum); - SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2); - SimdVector e4 = multiplyAdd(two, e3, e1); - - // apply the metric to the error term - SimdVector e5 = e4 * m_metricSqr; - SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); - - // keep the solution if it wins - if (compareAnyLessThan(error, besterror)) - { - besterror = error; - beststart = a; - bestend = b; - } - - x2 += m_weighted[c0 + c1 + c2]; - } - - x1 += m_weighted[c0 + c1]; - } - - x0 += m_weighted[c0]; - } - - // save the block if necessary - if (compareAnyLessThan(besterror, m_besterror)) - { - *start = beststart.toVector3(); - *end = bestend.toVector3(); - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -#else - -// This is the ideal way to round, but it's too expensive to do this in the inner loop. -inline Vector3 round565(const Vector3 & v) { - static const Vector3 grid = { 31.0f, 63.0f, 31.0f }; - static const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f }; - - Vector3 q = floor(grid * v); - q.x += (v.x > midpoints5[int(q.x)]); - q.y += (v.y > midpoints6[int(q.y)]); - q.z += (v.z > midpoints5[int(q.z)]); - q *= gridrcp; - return q; -} - -bool ClusterFit::compress3(Vector3 * start, Vector3 * end) -{ - const uint count = m_count; - const Vector3 grid = { 31.0f, 63.0f, 31.0f }; - const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f }; - - // declare variables - Vector3 beststart = { 0.0f }; - Vector3 bestend = { 0.0f }; - float besterror = FLT_MAX; - - Vector3 x0 = { 0.0f }; - float w0 = 0.0f; - - // check all possible clusters for this total order - for (uint c0 = 0; c0 <= count; c0++) - { - Vector3 x1 = { 0.0f }; - float w1 = 0.0f; - - for (uint c1 = 0; c1 <= count - c0; c1++) - { - float w2 = m_wsum - w0 - w1; - - // These factors could be entirely precomputed. - float const alpha2_sum = w0 + w1 * 0.25f; - float const beta2_sum = w2 + w1 * 0.25f; - float const alphabeta_sum = w1 * 0.25f; - float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - Vector3 const alphax_sum = x0 + x1 * 0.5f; - Vector3 const betax_sum = m_xsum - alphax_sum; - - Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum) * factor; - Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // clamp to the grid - a = saturate(a); - b = saturate(b); -#if ICBC_PERFECT_ROUND - a = round565(a); - b = round565(b); -#else - a = round(grid * a) * gridrcp; - b = round(grid * b) * gridrcp; -#endif - - // compute the error - Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum); - - // apply the metric to the error term - float error = dot(e1, m_metricSqr); - - // keep the solution if it wins - if (error < besterror) - { - besterror = error; - beststart = a; - bestend = b; - } - - x1 += m_weighted[c0 + c1]; - w1 += m_weights[c0 + c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if (besterror < m_besterror) - { - - *start = beststart; - *end = bestend; - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::compress4(Vector3 * start, Vector3 * end) -{ - const uint count = m_count; - const Vector3 grid = { 31.0f, 63.0f, 31.0f }; - const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f }; - - // declare variables - Vector3 beststart = { 0.0f }; - Vector3 bestend = { 0.0f }; - float besterror = FLT_MAX; - - Vector3 x0 = { 0.0f }; - float w0 = 0.0f; - - // check all possible clusters for this total order - for (uint c0 = 0; c0 <= count; c0++) - { - Vector3 x1 = { 0.0f }; - float w1 = 0.0f; - - for (uint c1 = 0; c1 <= count - c0; c1++) - { - Vector3 x2 = { 0.0f }; - float w2 = 0.0f; - - for (uint c2 = 0; c2 <= count - c0 - c1; c2++) - { - float w3 = m_wsum - w0 - w1 - w2; - - float const alpha2_sum = w0 + w1 * (4.0f / 9.0f) + w2 * (1.0f / 9.0f); - float const beta2_sum = w3 + w2 * (4.0f / 9.0f) + w1 * (1.0f / 9.0f); - float const alphabeta_sum = (w1 + w2) * (2.0f / 9.0f); - float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - Vector3 const betax_sum = m_xsum - alphax_sum; - - Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum)*factor; - Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum)*factor; - - // clamp to the grid - a = saturate(a); - b = saturate(b); -#if ICBC_PERFECT_ROUND - a = round565(a); - b = round565(b); -#else - a = round(grid * a) * gridrcp; - b = round(grid * b) * gridrcp; -#endif - // @@ It would be much more accurate to evaluate the error exactly. - - // compute the error - Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum); - - // apply the metric to the error term - float error = dot(e1, m_metricSqr); - - // keep the solution if it wins - if (error < besterror) - { - besterror = error; - beststart = a; - bestend = b; - } - - x2 += m_weighted[c0 + c1 + c2]; - w2 += m_weights[c0 + c1 + c2]; - } - - x1 += m_weighted[c0 + c1]; - w1 += m_weights[c0 + c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if (besterror < m_besterror) - { - *start = beststart; - *end = bestend; - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end) -{ - const uint count = m_count; - const Vector3 grid = { 31.0f, 63.0f, 31.0f }; - const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f }; - - // declare variables - Vector3 beststart = { 0.0f }; - Vector3 bestend = { 0.0f }; - float besterror = FLT_MAX; - - Vector3 x0 = { 0.0f }; - float w0 = 0.0f; - - // check all possible clusters for this total order - for (uint c0 = 0, i = 0; c0 <= count; c0++) - { - Vector3 x1 = { 0.0f }; - float w1 = 0.0f; - - for (uint c1 = 0; c1 <= count - c0; c1++, i++) - { - float const alpha2_sum = s_threeElement[i].alpha2_sum; - float const beta2_sum = s_threeElement[i].beta2_sum; - float const alphabeta_sum = s_threeElement[i].alphabeta_sum; - float const factor = s_threeElement[i].factor; - - Vector3 const alphax_sum = x0 + x1 * 0.5f; - Vector3 const betax_sum = m_xsum - alphax_sum; - - Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum) * factor; - Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // clamp to the grid - a = saturate(a); - b = saturate(b); -#if ICBC_PERFECT_ROUND - a = round565(a); - b = round565(b); -#else - a = round(grid * a) * gridrcp; - b = round(grid * b) * gridrcp; -#endif - - // compute the error - Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum); - - // apply the metric to the error term - float error = dot(e1, m_metricSqr); - - // keep the solution if it wins - if (error < besterror) - { - besterror = error; - beststart = a; - bestend = b; - } - - x1 += m_weighted[c0 + c1]; - w1 += m_weights[c0 + c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if (besterror < m_besterror) - { - - *start = beststart; - *end = bestend; - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end) -{ - const uint count = m_count; - const Vector3 grid = { 31.0f, 63.0f, 31.0f }; - const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f }; - - // declare variables - Vector3 beststart = { 0.0f }; - Vector3 bestend = { 0.0f }; - float besterror = FLT_MAX; - - Vector3 x0 = { 0.0f }; - float w0 = 0.0f; - - // check all possible clusters for this total order - for (uint c0 = 0, i = 0; c0 <= count; c0++) - { - Vector3 x1 = { 0.0f }; - float w1 = 0.0f; - - for (uint c1 = 0; c1 <= count - c0; c1++) - { - Vector3 x2 = { 0.0f }; - float w2 = 0.0f; - - for (uint c2 = 0; c2 <= count - c0 - c1; c2++, i++) - { - float const alpha2_sum = s_fourElement[i].alpha2_sum; - float const beta2_sum = s_fourElement[i].beta2_sum; - float const alphabeta_sum = s_fourElement[i].alphabeta_sum; - float const factor = s_fourElement[i].factor; - - Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - Vector3 const betax_sum = m_xsum - alphax_sum; - - Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum)*factor; - Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum)*factor; - - // clamp to the grid - a = saturate(a); - b = saturate(b); -#if ICBC_PERFECT_ROUND - a = round565(a); - b = round565(b); -#else - a = round(grid * a) * gridrcp; - b = round(grid * b) * gridrcp; -#endif - // @@ It would be much more accurate to evaluate the error exactly. - - // compute the error - Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum); - - // apply the metric to the error term - float error = dot(e1, m_metricSqr); - - // keep the solution if it wins - if (error < besterror) - { - besterror = error; - beststart = a; - bestend = b; - } - - x2 += m_weighted[c0 + c1 + c2]; - w2 += m_weights[c0 + c1 + c2]; - } - - x1 += m_weighted[c0 + c1]; - w1 += m_weights[c0 + c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if (besterror < m_besterror) - { - *start = beststart; - *end = bestend; - - // save the error - m_besterror = besterror; - - return true; - } - - return false; -} - -#endif // ICBC_USE_SIMD - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Palette evaluation. - -// D3D10 -inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (2 * palette[0].r + palette[1].r) / 3; - palette[2].g = (2 * palette[0].g + palette[1].g) / 3; - palette[2].b = (2 * palette[0].b + palette[1].b) / 3; - palette[2].a = 0xFF; - - palette[3].r = (2 * palette[1].r + palette[0].r) / 3; - palette[3].g = (2 * palette[1].g + palette[0].g) / 3; - palette[3].b = (2 * palette[1].b + palette[0].b) / 3; - palette[3].a = 0xFF; -} -inline void evaluate_palette3_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (palette[0].r + palette[1].r) / 2; - palette[2].g = (palette[0].g + palette[1].g) / 2; - palette[2].b = (palette[0].b + palette[1].b) / 2; - palette[2].a = 0xFF; - palette[3].u = 0; -} -static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[0] = bitexpand_color16_to_color32(c0); - palette[1] = bitexpand_color16_to_color32(c1); - if (c0.u > c1.u) { - evaluate_palette4_d3d10(c0, c1, palette); - } - else { - evaluate_palette3_d3d10(c0, c1, palette); - } -} - -// NV -inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { - int gdiff = palette[1].g - palette[0].g; - palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; - palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256; - palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; - palette[2].a = 0xFF; - - palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; - palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256; - palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; - palette[3].a = 0xFF; -} -inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { - int gdiff = palette[1].g - palette[0].g; - palette[2].r = ((c0.r + c1.r) * 33) / 8; - palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256; - palette[2].b = ((c0.b + c1.b) * 33) / 8; - palette[2].a = 0xFF; - palette[3].u = 0; -} -static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[0] = bitexpand_color16_to_color32(c0); - palette[1] = bitexpand_color16_to_color32(c1); - - if (c0.u > c1.u) { - evaluate_palette4_nv(c0, c1, palette); - } - else { - evaluate_palette3_nv(c0, c1, palette); - } -} - -// AMD -inline void evaluate_palette4_amd(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (43 * palette[0].r + 21 * palette[1].r + 32) / 8; - palette[2].g = (43 * palette[0].g + 21 * palette[1].g + 32) / 8; - palette[2].b = (43 * palette[0].b + 21 * palette[1].b + 32) / 8; - palette[2].a = 0xFF; - - palette[3].r = (43 * palette[1].r + 21 * palette[0].r + 32) / 8; - palette[3].g = (43 * palette[1].g + 21 * palette[0].g + 32) / 8; - palette[3].b = (43 * palette[1].b + 21 * palette[0].b + 32) / 8; - palette[3].a = 0xFF; -} -inline void evaluate_palette3_amd(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (c0.r + c1.r + 1) / 2; - palette[2].g = (c0.g + c1.g + 1) / 2; - palette[2].b = (c0.b + c1.b + 1) / 2; - palette[2].a = 0xFF; - palette[3].u = 0; -} -static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[0] = bitexpand_color16_to_color32(c0); - palette[1] = bitexpand_color16_to_color32(c1); - - if (c0.u > c1.u) { - evaluate_palette4_amd(c0, c1, palette); - } - else { - evaluate_palette3_amd(c0, c1, palette); - } -} - -// Use ICBC_DECODER to determine decoder used. -inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) { -#if ICBC_DECODER == Decoder_D3D10 - evaluate_palette4_d3d10(c0, c1, palette); -#elif ICBC_DECODER == Decoder_NVIDIA - evaluate_palette4_nv(c0, c1, palette); -#elif ICBC_DECODER == Decoder_AMD - evaluate_palette4_amd(c0, c1, palette); -#endif -} -inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) { -#if ICBC_DECODER == Decoder_D3D10 - evaluate_palette3_d3d10(c0, c1, palette); -#elif ICBC_DECODER == Decoder_NVIDIA - evaluate_palette3_nv(c0, c1, palette); -#elif ICBC_DECODER == Decoder_AMD - evaluate_palette3_amd(c0, c1, palette); -#endif -} -inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { -#if ICBC_DECODER == Decoder_D3D10 - evaluate_palette_d3d10(c0, c1, palette); -#elif ICBC_DECODER == Decoder_NVIDIA - evaluate_palette_nv(c0, c1, palette); -#elif ICBC_DECODER == Decoder_AMD - evaluate_palette_amd(c0, c1, palette); -#endif -} - -static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { - Color32 palette32[4]; - evaluate_palette(c0, c1, palette32); - - for (int i = 0; i < 4; i++) { - palette[i] = color_to_vector3(palette32[i]); - } -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Error evaluation. - -// Different ways of estimating the error. - -static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { - Vector3 d = (p - c) * w * 255; - return dot(d, d); -} - -static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) { - Vector3 d = (color_to_vector3(p) - c) * w * 255; - return dot(d, d); -} - - -/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { - return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); -}*/ - -static int evaluate_mse(const Color32 & p, const Color32 & c) { - return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); -} - -/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) { - float e0 = evaluate_mse(palette[0], c, w); - float e1 = evaluate_mse(palette[1], c, w); - float e2 = evaluate_mse(palette[2], c, w); - float e3 = evaluate_mse(palette[3], c, w); - return min(min(e0, e1), min(e2, e3)); -}*/ - -static int evaluate_mse(const Color32 palette[4], const Color32 & c) { - int e0 = evaluate_mse(palette[0], c); - int e1 = evaluate_mse(palette[1], c); - int e2 = evaluate_mse(palette[2], c); - int e3 = evaluate_mse(palette[3], c); - return min(min(e0, e1), min(e2, e3)); -} - -// Returns MSE error in [0-255] range. -static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { - Color32 palette[4]; - evaluate_palette(output->col0, output->col1, palette); - - return evaluate_mse(palette[index], color); -} - -// Returns weighted MSE error in [0-255] range. -static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { - - float total = 0.0f; - for (int i = 0; i < count; i++) { - total += weights[i] * evaluate_mse(palette, colors[i]); - } - - return total; -} - -static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) { - - float total = 0.0f; - for (int i = 0; i < count; i++) { - total += evaluate_mse(palette, colors[i]); - } - - return total; -} - -#if 0 -static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) { - Color32 palette[4]; - output->evaluatePalette(palette, /*d3d9=*/false); - - // convert palette to float. - Vector3 vector_palette[4]; - for (int i = 0; i < 4; i++) { - vector_palette[i] = color_to_vector3(palette[i]); - } - - // evaluate error for each index. - float error = 0.0f; - for (int i = 0; i < 16; i++) { - int index = (output->indices >> (2*i)) & 3; - error += evaluate_mse(vector_palette[index], colors[i]); - } - - return error; -} -#endif - -static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { - Color32 palette[4]; - evaluate_palette(output->col0, output->col1, palette); - - // convert palette to float. - /*Vector3 vector_palette[4]; - for (int i = 0; i < 4; i++) { - vector_palette[i] = color_to_vector3(palette[i]); - }*/ - - // evaluate error for each index. - float error = 0.0f; - for (int i = 0; i < 16; i++) { - int index = (output->indices >> (2 * i)) & 3; - error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); - } - return error; -} - -float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) { - Color32 palette[4]; - if (decoder == Decoder_NVIDIA) { - evaluate_palette_nv(block->col0, block->col1, palette); - } - else if (decoder == Decoder_AMD) { - evaluate_palette_amd(block->col0, block->col1, palette); - } - else { - evaluate_palette(block->col0, block->col1, palette); - } - - // evaluate error for each index. - float error = 0.0f; - for (int i = 0; i < 16; i++) { - int index = (block->indices >> (2 * i)) & 3; - Color32 c; - c.r = rgba_block[4 * i + 0]; - c.g = rgba_block[4 * i + 1]; - c.b = rgba_block[4 * i + 2]; - c.a = 255; - error += evaluate_mse(palette[index], c); - } - return error; -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Index selection - -static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { - - uint indices = 0; - for (int i = 0; i < 16; i++) { - float d0 = evaluate_mse(palette[0], input_colors[i].xyz, color_weights); - float d1 = evaluate_mse(palette[1], input_colors[i].xyz, color_weights); - float d2 = evaluate_mse(palette[2], input_colors[i].xyz, color_weights); - float d3 = evaluate_mse(palette[3], input_colors[i].xyz, color_weights); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); - } - - return indices; -} - - -static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) { - - uint indices = 0; - for (int i = 0; i < 16; i++) { - float d0 = evaluate_mse(palette[0], input_colors[i], {1,1,1}); - float d1 = evaluate_mse(palette[1], input_colors[i], {1,1,1}); - float d2 = evaluate_mse(palette[2], input_colors[i], {1,1,1}); - float d3 = evaluate_mse(palette[3], input_colors[i], {1,1,1}); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); - } - - return indices; -} - - -static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { - - uint indices = 0; - for (int i = 0; i < 16; i++) { - float d0 = evaluate_mse(palette[0], input_colors[i].xyz, color_weights); - float d1 = evaluate_mse(palette[1], input_colors[i].xyz, color_weights); - float d2 = evaluate_mse(palette[2], input_colors[i].xyz, color_weights); - float d3 = evaluate_mse(palette[3], input_colors[i].xyz, color_weights); - - uint index; - if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; - else if (d1 < d2 && d1 < d3) index = 1; - else if (d2 < d3) index = 2; - else index = 3; - - indices |= index << (2 * i); - } - - return indices; -} - - -static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) -{ - Color16 color0 = vector3_to_color16(v0); - Color16 color1 = vector3_to_color16(v1); - - if (color0.u > color1.u) { - swap(color0, color1); - } - - Vector3 palette[4]; - evaluate_palette(color0, color1, palette); - - block->col0 = color0; - block->col1 = color1; - block->indices = compute_indices(input_colors, color_weights, palette); -} - -static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) -{ - Color16 color0 = vector3_to_color16(v0); - Color16 color1 = vector3_to_color16(v1); - - if (color0.u < color1.u) { - swap(color0, color1); - } - - Vector3 palette[4]; - evaluate_palette(color0, color1, palette); - - block->col0 = color0; - block->col1 = color1; - block->indices = compute_indices4(input_colors, color_weights, palette); -} - - -static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) -{ - Color16 color0 = vector3_to_color16(v0); - Color16 color1 = vector3_to_color16(v1); - - if (color0.u < color1.u) { - swap(color0, color1); - } - - Vector3 palette[4]; - evaluate_palette(color0, color1, palette); - - block->col0 = color0; - block->col1 = color1; - block->indices = compute_indices4(input_colors, palette); -} - -// Least squares fitting of color end points for the given indices. @@ Take weights into account. -static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) -{ - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - Vector3 alphax_sum = { 0,0,0 }; - Vector3 betax_sum = { 0,0,0 }; - - for (int i = 0; i < count; i++) - { - const uint bits = indices >> (2 * i); - - float beta = float(bits & 1); - if (bits & 2) beta = (1 + beta) / 3.0f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i].xyz; - betax_sum += beta * colors[i].xyz; - } - - float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; - if (equal(denom, 0.0f)) return false; - - float factor = 1.0f / denom; - - *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); - *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); - - return true; -} - -// Least squares optimization with custom factors. -// This allows us passing the standard [1, 0, 2/3 1/3] weights by default, but also use alternative mappings when the number of clusters is not 4. -static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b) -{ - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - Vector3 alphax_sum = { 0,0,0 }; - Vector3 betax_sum = { 0,0,0 }; - - for (int i = 0; i < count; i++) - { - const uint idx = (indices >> (2 * i)) & 3; - float alpha = factors[idx]; - float beta = 1 - alpha; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - betax_sum += beta * colors[i]; - } - - float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; - if (equal(denom, 0.0f)) return false; - - float factor = 1.0f / denom; - - *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); - *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); - - return true; -} - -static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b) -{ - float factors[4] = { 1, 0, 2.f / 3, 1.f / 3 }; - return optimize_end_points4(indices, colors, count, factors, a, b); -} - - -// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account. -static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) -{ - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - Vector3 alphax_sum = { 0,0,0 }; - Vector3 betax_sum = { 0,0,0 }; - - for (int i = 0; i < count; i++) - { - const uint bits = indices >> (2 * i); - - float beta = float(bits & 1); - if (bits & 2) beta = 0.5f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - betax_sum += beta * colors[i]; - } - - float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; - if (equal(denom, 0.0f)) return false; - - float factor = 1.0f / denom; - - *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); - *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); - - return true; -} - - -// find minimum and maximum colors based on bounding box in color space -inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) -{ - *c0 = { 0,0,0 }; - *c1 = { 1,1,1 }; - - for (int i = 0; i < count; i++) { - *c0 = max(*c0, colors[i]); - *c1 = min(*c1, colors[i]); - } -} - -inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) -{ - Vector3 center = (*c0 + *c1) * 0.5f; - - /*Vector3 center = colors[0]; - for (int i = 1; i < count; i++) { - center = center * float(i-1) / i + colors[i] / i; - }*/ - /*Vector3 center = colors[0]; - for (int i = 1; i < count; i++) { - center += colors[i]; - } - center /= count;*/ - - float cov_xz = 0.0f; - float cov_yz = 0.0f; - for (int i = 0; i < count; i++) { - Vector3 t = colors[i] - center; - cov_xz += t.x * t.z; - cov_yz += t.y * t.z; - } - - float x0 = c0->x; - float y0 = c0->y; - float x1 = c1->x; - float y1 = c1->y; - - if (cov_xz < 0) { - swap(x0, x1); - } - if (cov_yz < 0) { - swap(y0, y1); - } - - *c0 = { x0, y0, c0->z }; - *c1 = { x1, y1, c1->z }; -} - -inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1) -{ - float bias = (8.0f / 255.0f) / 16.0f; - Vector3 inset = (*c0 - *c1) / 16.0f - scalar_to_vector3(bias); - *c0 = saturate(*c0 - inset); - *c1 = saturate(*c1 + inset); -} - - - -// Single color lookup tables from: -// https://github.com/nothings/stb/blob/master/stb_dxt.h -static uint8 match5[256][2]; -static uint8 match6[256][2]; - -static inline int Lerp13(int a, int b) -{ - // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. - return (a * 2 + b) / 3; -} - -static void PrepareOptTable(uint8 * table, const uint8 * expand, int size) -{ - for (int i = 0; i < 256; i++) { - int bestErr = 256 * 100; - - for (int min = 0; min < size; min++) { - for (int max = 0; max < size; max++) { - int mine = expand[min]; - int maxe = expand[max]; - - int err = abs(Lerp13(maxe, mine) - i) * 100; - - // DX10 spec says that interpolation must be within 3% of "correct" result, - // add this as error term. (normally we'd expect a random distribution of - // +-1.5% error, but nowhere in the spec does it say that the error has to be - // unbiased - better safe than sorry). - err += abs(max - min) * 3; - - if (err < bestErr) { - bestErr = err; - table[i * 2 + 0] = max; - table[i * 2 + 1] = min; - } - } - } - } -} - -static void init_dxt1_tables() -{ - // Prepare single color lookup tables. - uint8 expand5[32]; - uint8 expand6[64]; - for (int i = 0; i < 32; i++) expand5[i] = (i << 3) | (i >> 2); - for (int i = 0; i < 64; i++) expand6[i] = (i << 2) | (i >> 4); - - PrepareOptTable(&match5[0][0], expand5, 32); - PrepareOptTable(&match6[0][0], expand6, 64); -} - -// Single color compressor, based on: -// https://mollyrocket.com/forums/viewtopic.php?t=392 -static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) -{ - output->col0.r = match5[c.r][0]; - output->col0.g = match6[c.g][0]; - output->col0.b = match5[c.b][0]; - output->col1.r = match5[c.r][1]; - output->col1.g = match6[c.g][1]; - output->col1.b = match5[c.b][1]; - output->indices = 0xaaaaaaaa; - - if (output->col0.u < output->col1.u) - { - swap(output->col0.u, output->col1.u); - output->indices ^= 0x55555555; - } -} - - -// Compress block using the average color. -static float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) -{ - // Compute block average. - Vector3 color_sum = { 0,0,0 }; - float weight_sum = 0; - - for (int i = 0; i < count; i++) { - color_sum += colors[i] * weights[i]; - weight_sum += weights[i]; - } - - // Compress optimally. - compress_dxt1_single_color_optimal(vector3_to_color32(color_sum / weight_sum), output); - - // Decompress block color. - Color32 palette[4]; - evaluate_palette(output->col0, output->col1, palette); - - Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]); - - // Evaluate error. - float error = 0; - for (int i = 0; i < count; i++) { - error += weights[i] * evaluate_mse(block_color, colors[i], color_weights); - } - return error; -} - - -static float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output) -{ - // Compute bounding box. - Vector3 min_color = { 1,1,1 }; - Vector3 max_color = { 0,0,0 }; - - for (int i = 0; i < count; i++) { - min_color = min(min_color, colors[i]); - max_color = max(max_color, colors[i]); - } - - // Convert to 5:6:5 - int min_r = int(31 * min_color.x); - int min_g = int(63 * min_color.y); - int min_b = int(31 * min_color.z); - int max_r = int(31 * max_color.x + 1); - int max_g = int(63 * max_color.y + 1); - int max_b = int(31 * max_color.z + 1); - - // Expand the box. - int range_r = max_r - min_r; - int range_g = max_g - min_g; - int range_b = max_b - min_b; - - min_r = max(0, min_r - range_r / 2 - 2); - min_g = max(0, min_g - range_g / 2 - 2); - min_b = max(0, min_b - range_b / 2 - 2); - - max_r = min(31, max_r + range_r / 2 + 2); - max_g = min(63, max_g + range_g / 2 + 2); - max_b = min(31, max_b + range_b / 2 + 2); - - // Estimate size of search space. - int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1); - - // if size under search_limit, then proceed. Note that search_volume is sqrt of number of evaluations. - if (volume > max_volume) { - return FLT_MAX; - } - - // @@ Convert to fixed point before building box? - Color32 colors32[16]; - for (int i = 0; i < count; i++) { - colors32[i] = vector3_to_color32(colors[i]); - } - - float best_error = FLT_MAX; - Color16 best0, best1; // @@ Record endpoints as Color16? - - Color16 c0, c1; - Color32 palette[4]; - - for(int r0 = min_r; r0 <= max_r; r0++) - for(int g0 = min_g; g0 <= max_g; g0++) - for(int b0 = min_b; b0 <= max_b; b0++) - { - c0.r = r0; c0.g = g0; c0.b = b0; - palette[0] = bitexpand_color16_to_color32(c0); - - for(int r1 = min_r; r1 <= max_r; r1++) - for(int g1 = min_g; g1 <= max_g; g1++) - for(int b1 = min_b; b1 <= max_b; b1++) - { - c1.r = r1; c1.g = g1; c1.b = b1; - palette[1] = bitexpand_color16_to_color32(c1); - - if (c0.u > c1.u) { - // Evaluate error in 4 color mode. - evaluate_palette4(c0, c1, palette); - } - else { - if (three_color_mode) { - // Evaluate error in 3 color mode. - evaluate_palette3(c0, c1, palette); - } - else { - // Skip 3 color mode. - continue; - } - } - - float error = evaluate_palette_error(palette, colors32, weights, count); - - if (error < best_error) { - best_error = error; - best0 = c0; - best1 = c1; - } - } - } - - output->col0 = best0; - output->col1 = best1; - - Vector3 vector_palette[4]; - evaluate_palette(output->col0, output->col1, vector_palette); - - output->indices = compute_indices(input_colors, color_weights, vector_palette); - - return best_error / (255 * 255); -} - - -static void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output) -{ - ClusterFit fit; - -#if ICBC_FAST_CLUSTER_FIT - if (count > 15) { - fit.setColorSet(input_colors, color_weights); - - // start & end are in [0, 1] range. - Vector3 start, end; - fit.fastCompress4(&start, &end); - - if (three_color_mode && fit.fastCompress3(&start, &end)) { - output_block3(input_colors, color_weights, start, end, output); - } - else { - output_block4(input_colors, color_weights, start, end, output); - } - } - else -#endif - { - fit.setColorSet(colors, weights, count, color_weights); - - // start & end are in [0, 1] range. - Vector3 start, end; - fit.compress4(&start, &end); - - if (three_color_mode && fit.compress3(&start, &end)) { - output_block3(input_colors, color_weights, start, end, output); - } - else { - output_block4(input_colors, color_weights, start, end, output); - } - } -} - - -static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) { - // TODO: - // - Optimize palette evaluation when updating only one channel. - // - try all diagonals. - - // Things that don't help: - // - Alternate endpoint updates. - // - Randomize order. - // - If one direction does not improve, test opposite direction next. - - static const int8 deltas[16][3] = { - {1,0,0}, - {0,1,0}, - {0,0,1}, - - {-1,0,0}, - {0,-1,0}, - {0,0,-1}, - - {1,1,0}, - {1,0,1}, - {0,1,1}, - - {-1,-1,0}, - {-1,0,-1}, - {0,-1,-1}, - - {-1,1,0}, - //{-1,0,1}, - - {1,-1,0}, - {0,-1,1}, - - //{1,0,-1}, - {0,1,-1}, - }; - - float best_error = input_error; - - int lastImprovement = 0; - for (int i = 0; i < 256; i++) { - BlockDXT1 refined = *output; - int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] }; - - if ((i / 16) & 1) { - refined.col0.r += delta[0]; - refined.col0.g += delta[1]; - refined.col0.b += delta[2]; - } - else { - refined.col1.r += delta[0]; - refined.col1.g += delta[1]; - refined.col1.b += delta[2]; - } - - if (!three_color_mode) { - if (refined.col0.u == refined.col1.u) refined.col1.g += 1; - if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u); - } - - Vector3 palette[4]; - evaluate_palette(output->col0, output->col1, palette); - - refined.indices = compute_indices(input_colors, color_weights, palette); - - float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined); - if (refined_error < best_error) { - best_error = refined_error; - *output = refined; - lastImprovement = i; - } - - // Early out if the last 32 steps didn't improve error. - if (i - lastImprovement > 32) break; - } - - return best_error; -} - - -static float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output) -{ - Vector3 colors[16]; - float weights[16]; - int count = reduce_colors(input_colors, input_weights, colors, weights); - - if (count == 0) { - // Output trivial block. - output->col0.u = 0; - output->col1.u = 0; - output->indices = 0; - return 0; - } - - // Cluster fit cannot handle single color blocks, so encode them optimally. - if (count == 1) { - compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); - return evaluate_mse(input_colors, input_weights, color_weights, output); - } - - // Quick end point selection. - Vector3 c0, c1; - fit_colors_bbox(colors, count, &c0, &c1); - inset_bbox(&c0, &c1); - select_diagonal(colors, count, &c0, &c1); - output_block4(input_colors, color_weights, c0, c1, output); - - float error = evaluate_mse(input_colors, input_weights, color_weights, output); - - // Refine color for the selected indices. - if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { - BlockDXT1 optimized_block; - output_block4(input_colors, color_weights, c0, c1, &optimized_block); - - float optimized_error = evaluate_mse(input_colors, input_weights, color_weights, &optimized_block); - if (optimized_error < error) { - error = optimized_error; - *output = optimized_block; - } - } - - // @@ Use current endpoints as input for initial PCA approximation? - - // Try cluster fit. - BlockDXT1 cluster_fit_output; - compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output); - - float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output); - if (cluster_fit_error < error) { - *output = cluster_fit_output; - error = cluster_fit_error; - } - - if (hq) { - error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output); - } - - return error; -} - - -// -static bool centroid_end_points(uint indices, const Vector3 * colors, /*const float * weights,*/ float factor[4], Vector3 * c0, Vector3 * c1) { - - *c0 = { 0,0,0 }; - *c1 = { 0,0,0 }; - float w0_sum = 0; - float w1_sum = 0; - - for (int i = 0; i < 16; i++) { - int idx = (indices >> (2 * i)) & 3; - float w0 = factor[idx];// * weights[i]; - float w1 = (1 - factor[idx]);// * weights[i]; - - *c0 += colors[i] * w0; w0_sum += w0; - *c1 += colors[i] * w1; w1_sum += w1; - } - - *c0 *= (1.0f / w0_sum); - *c1 *= (1.0f / w1_sum); - - return true; -} - - - -static float compress_dxt1_test(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output) -{ - Vector3 colors[16]; - for (int i = 0; i < 16; i++) { - colors[i] = input_colors[i].xyz; - } - int count = 16; - - // Quick end point selection. - Vector3 c0, c1; - fit_colors_bbox(colors, count, &c0, &c1); - if (c0 == c1) { - compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); - return evaluate_mse(input_colors, input_weights, color_weights, output); - } - inset_bbox(&c0, &c1); - select_diagonal(colors, count, &c0, &c1); - - output_block4(colors, c0, c1, output); - float best_error = evaluate_mse(input_colors, input_weights, color_weights, output); - - - // Given an index assignment, we can compute end points in two different ways: - // - least squares optimization. - // - centroid. - // Are these different? The first finds the end points that minimize the least squares error. - // The second averages the input colors - - while (true) { - float last_error = best_error; - uint last_indices = output->indices; - - int cluster_counts[4] = { 0, 0, 0, 0 }; - for (int i = 0; i < 16; i++) { - int idx = (output->indices >> (2 * i)) & 3; - cluster_counts[idx] += 1; - } - int n = 0; - for (int i = 0; i < 4; i++) n += int(cluster_counts[i] != 0); - - if (n == 4) { - float factors[4] = { 1.0f, 0.0f, 2.0f / 3, 1.0f / 3 }; - if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) { - BlockDXT1 refined_block; - output_block4(colors, c0, c1, &refined_block); - float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block); - if (new_error < best_error) { - best_error = new_error; - *output = refined_block; - } - } - } - else if (n == 3) { - // 4 options: - static const float tables[4][3] = { - { 0, 2.f/3, 1.f/3 }, // 0, 1/3, 2/3 - { 1, 0, 1.f/3 }, // 0, 1/3, 1 - { 1, 0, 2.f/3 }, // 0, 2/3, 1 - { 1, 2.f/3, 1.f/3 }, // 1/2, 2/3, 1 - }; - - for (int k = 0; k < 4; k++) { - // Remap tables: - float factors[4]; - for (int i = 0, j = 0; i < 4; i++) { - factors[i] = tables[k][j]; - if (cluster_counts[i] != 0) j += 1; - } - if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) { - BlockDXT1 refined_block; - output_block4(colors, c0, c1, &refined_block); - float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block); - if (new_error < best_error) { - best_error = new_error; - *output = refined_block; - } - } - } - - // @@ And 1 3-color block: - // 0, 1/2, 1 - } - else if (n == 2) { - - // 6 options: - static const float tables[6][2] = { - { 0, 1.f/3 }, // 0, 1/3 - { 0, 2.f/3 }, // 0, 2/3 - { 1, 0 }, // 0, 1 - { 2.f/3, 1.f/3 }, // 1/3, 2/3 - { 1, 1.f/3 }, // 1/3, 1 - { 1, 2.f/3 }, // 2/3, 1 - }; - - for (int k = 0; k < 6; k++) { - // Remap tables: - float factors[4]; - for (int i = 0, j = 0; i < 4; i++) { - factors[i] = tables[k][j]; - if (cluster_counts[i] != 0) j += 1; - } - if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) { - BlockDXT1 refined_block; - output_block4(colors, c0, c1, &refined_block); - float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block); - if (new_error < best_error) { - best_error = new_error; - *output = refined_block; - } - } - } - - // @@ And 2 3-color blocks: - // 0, 0.5 - // 0.5, 1 - // 0, 1 // This is equivalent to the 4 color mode. - } - - // If error has not improved, stop. - //if (best_error == last_error) break; - - // If error has not improved or indices haven't changed, stop. - if (output->indices == last_indices || best_error < last_error) break; - } - - if (false) { - best_error = refine_endpoints(input_colors, input_weights, color_weights, false, best_error, output); - } - - return best_error; -} - - - -static float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output) -{ - Vector3 colors[16]; - for (int i = 0; i < 16; i++) { - colors[i] = input_colors[i].xyz; - } - int count = 16; - - /*float error = FLT_MAX; - error = compress_dxt1_single_color(colors, input_weights, count, color_weights, output); - - if (error == 0.0f || count == 1) { - // Early out. - return error; - }*/ - - // Quick end point selection. - Vector3 c0, c1; - fit_colors_bbox(colors, count, &c0, &c1); - if (c0 == c1) { - compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); - return evaluate_mse(input_colors, input_weights, color_weights, output); - } - inset_bbox(&c0, &c1); - select_diagonal(colors, count, &c0, &c1); - output_block4(input_colors, color_weights, c0, c1, output); - - // Refine color for the selected indices. - if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { - output_block4(input_colors, color_weights, c0, c1, output); - } - - return evaluate_mse(input_colors, input_weights, color_weights, output); -} - - -static void compress_dxt1_fast(const uint8 input_colors[16*4], BlockDXT1 * output) { - - Vector3 vec_colors[16]; - for (int i = 0; i < 16; i++) { - vec_colors[i] = { input_colors[4 * i + 0] / 255.0f, input_colors[4 * i + 1] / 255.0f, input_colors[4 * i + 2] / 255.0f }; - } - - // Quick end point selection. - Vector3 c0, c1; - //fit_colors_bbox(colors, count, &c0, &c1); - //select_diagonal(colors, count, &c0, &c1); - fit_colors_bbox(vec_colors, 16, &c0, &c1); - if (c0 == c1) { - compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); - return; - } - inset_bbox(&c0, &c1); - select_diagonal(vec_colors, 16, &c0, &c1); - output_block4(vec_colors, c0, c1, output); - - // Refine color for the selected indices. - if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) { - output_block4(vec_colors, c0, c1, output); - } -} - -// Public API - -void init() { - init_dxt1_tables(); -} - -float compress_dxt1(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], bool three_color_mode, bool hq, void * output) { - return compress_dxt1((Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, hq, (BlockDXT1*)output); -} - -float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], void * output) { - return compress_dxt1_fast((Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, (BlockDXT1*)output); -} - -void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output) { - compress_dxt1_fast(input_colors, (BlockDXT1*)output); -} - -void compress_dxt1_test(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], void * output) { - compress_dxt1_test((Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, (BlockDXT1*)output); -} - -float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) { - return evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block, decoder); -} - -} // icbc -#endif // ICBC_IMPLEMENTATION +// icbc.h v1.03 +// A High Quality SIMD BC1 Encoder by Ignacio Castano . +// +// LICENSE: +// MIT license at the end of this file. + +#ifndef ICBC_H +#define ICBC_H + +namespace icbc { + + void init_dxt1(); + + enum Quality { + Quality_Level1, // Box fit + least squares fit. + Quality_Level2, // Cluster fit 4, threshold = 24. + Quality_Level3, // Cluster fit 4, threshold = 32. + Quality_Level4, // Cluster fit 4, threshold = 48. + Quality_Level5, // Cluster fit 4, threshold = 64. + Quality_Level6, // Cluster fit 4, threshold = 96. + Quality_Level7, // Cluster fit 4, threshold = 128. + Quality_Level8, // Cluster fit 4+3, threshold = 256. + Quality_Level9, // Cluster fit 4+3, threshold = 256 + Refinement. + + Quality_Fast = Quality_Level1, + Quality_Default = Quality_Level8, + Quality_Max = Quality_Level9, + }; + + float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output); + + // @@ Is there any difference between this and compress_dxt1(Quality_Level0) ? + float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float color_weights[3], void * output); + void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output); + + + enum Decoder { + Decoder_D3D10 = 0, + Decoder_NVIDIA = 1, + Decoder_AMD = 2 + }; + + void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10); + float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10); + +} + +#endif // ICBC_H + +#ifdef ICBC_IMPLEMENTATION + +// Instruction level support must be chosen at compile time setting ICBC_SIMD to one of these values: +#define ICBC_FLOAT 0 +#define ICBC_SSE2 1 +#define ICBC_SSE41 2 +#define ICBC_AVX1 3 +#define ICBC_AVX2 4 +#define ICBC_AVX512 5 +#define ICBC_NEON -1 +#define ICBC_VMX -2 + +// SIMD version. (FLOAT=0, SSE2=1, SSE41=2, AVX1=3, AVX2=4, AVX512=5, NEON=-1, VMX=-2) +#ifndef ICBC_SIMD +#if _M_ARM +#define ICBC_SIMD -1 +#else +#define ICBC_SIMD 4 +#endif +#endif + +// AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess. +#ifndef ICBC_USE_FMA +//#define ICBC_USE_FMA 3 +//#define ICBC_USE_FMA 4 +#endif + +#if ICBC_SIMD >= ICBC_AVX2 +#define ICBC_BMI2 1 +#endif + +// Apparently rcp is not deterministic (different precision on Intel and AMD), enable if you don't care about that for a small performance boost. +//#define ICBC_USE_RCP 1 + +#if ICBC_SIMD == ICBC_AVX2 +#define ICBC_USE_AVX2_PERMUTE2 1 // Using permutevar8x32 and bitops. +#endif + +#if ICBC_SIMD == ICBC_AVX512 +#define ICBC_USE_AVX512_PERMUTE 1 +#endif + +#if ICBC_SIMD == ICBC_NEON +#define ICBC_USE_NEON_VTL 0 // Not tested. +#endif + + +#ifndef ICBC_DECODER +#define ICBC_DECODER 0 // 0 = d3d10, 1 = nvidia, 2 = amd +#endif + +// Some experimental knobs: +#define ICBC_PERFECT_ROUND 0 // Enable perfect rounding to compute cluster fit residual. + + +#if ICBC_SIMD >= ICBC_SSE2 +#include +#endif + +#if ICBC_SIMD >= ICBC_SSE41 +#include +#endif + +#if ICBC_SIMD >= ICBC_AVX1 +#include +#endif + +#if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER +#include +#endif + +#if ICBC_SIMD == ICBC_NEON +#include +#endif + +#if ICBC_SIMD == ICBC_VMX +#include +#endif + +#if _MSC_VER +#include // _BitScanReverse +#endif + +#include +#include // abs +#include // memset +#include // floorf +#include // FLT_MAX + +#ifndef ICBC_ASSERT +#if _DEBUG +#define ICBC_ASSERT assert +#include +#else +#define ICBC_ASSERT(x) +#endif +#endif + +namespace icbc { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Basic Templates + +template inline void swap(T & a, T & b) { + T temp(a); + a = b; + b = temp; +} + +template inline T max(const T & a, const T & b) { + return (b < a) ? a : b; +} + +template inline T min(const T & a, const T & b) { + return (a < b) ? a : b; +} + +template inline T clamp(const T & x, const T & a, const T & b) { + return min(max(x, a), b); +} + +template inline T square(const T & a) { + return a * a; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Basic Types + +typedef uint8_t uint8; +typedef int8_t int8; +typedef uint16_t uint16; +typedef uint32_t uint32; +typedef uint32_t uint; + + +struct Color16 { + union { + struct { + uint16 b : 5; + uint16 g : 6; + uint16 r : 5; + }; + uint16 u; + }; +}; + +struct Color32 { + union { + struct { + uint8 b, g, r, a; + }; + uint32 u; + }; +}; + +struct BlockDXT1 { + Color16 col0; + Color16 col1; + uint32 indices; +}; + + +struct Vector3 { + float x; + float y; + float z; + + inline void operator+=(Vector3 v) { + x += v.x; y += v.y; z += v.z; + } + inline void operator*=(Vector3 v) { + x *= v.x; y *= v.y; z *= v.z; + } + inline void operator*=(float s) { + x *= s; y *= s; z *= s; + } +}; + +struct Vector4 { + union { + struct { + float x, y, z, w; + }; + Vector3 xyz; + }; +}; + + +inline Vector3 operator*(Vector3 v, float s) { + return { v.x * s, v.y * s, v.z * s }; +} + +inline Vector3 operator*(float s, Vector3 v) { + return { v.x * s, v.y * s, v.z * s }; +} + +inline Vector3 operator*(Vector3 a, Vector3 b) { + return { a.x * b.x, a.y * b.y, a.z * b.z }; +} + +inline float dot(Vector3 a, Vector3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline Vector3 operator+(Vector3 a, Vector3 b) { + return { a.x + b.x, a.y + b.y, a.z + b.z }; +} + +inline Vector3 operator-(Vector3 a, Vector3 b) { + return { a.x - b.x, a.y - b.y, a.z - b.z }; +} + +inline Vector3 operator/(Vector3 v, float s) { + return { v.x / s, v.y / s, v.z / s }; +} + +inline float saturate(float x) { + return clamp(x, 0.0f, 1.0f); +} + +inline Vector3 saturate(Vector3 v) { + return { saturate(v.x), saturate(v.y), saturate(v.z) }; +} + +inline Vector3 min(Vector3 a, Vector3 b) { + return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) }; +} + +inline Vector3 max(Vector3 a, Vector3 b) { + return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) }; +} + +inline Vector3 round(Vector3 v) { + return { floorf(v.x+0.5f), floorf(v.y + 0.5f), floorf(v.z + 0.5f) }; +} + +inline Vector3 floor(Vector3 v) { + return { floorf(v.x), floorf(v.y), floorf(v.z) }; +} + +inline bool operator==(const Vector3 & a, const Vector3 & b) { + return a.x == b.x && a.y == b.y && a.z == b.z; +} + +inline Vector3 scalar_to_vector3(float f) { + return {f, f, f}; +} + +inline float lengthSquared(Vector3 v) { + return dot(v, v); +} + +inline bool equal(float a, float b, float epsilon = 0.0001) { + // http://realtimecollisiondetection.net/blog/?p=89 + //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b))); + return fabsf(a - b) < epsilon; +} + +inline bool equal(Vector3 a, Vector3 b, float epsilon) { + return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// SIMD + +#ifndef ICBC_ALIGN_16 +#if __GNUC__ +# define ICBC_ALIGN_16 __attribute__ ((__aligned__ (16))) +#else // _MSC_VER +# define ICBC_ALIGN_16 __declspec(align(16)) +#endif +#endif + +#if __GNUC__ +#define ICBC_FORCEINLINE inline __attribute__((always_inline)) +#else +#define ICBC_FORCEINLINE __forceinline +#endif + + +// Count trailing zeros (BSR). +ICBC_FORCEINLINE int ctz(uint mask) { +#if __GNUC__ + return __builtin_ctz(mask); +#else + unsigned long index; + _BitScanReverse(&index, mask); + return (int)index; +#endif +} + + +#if ICBC_SIMD == ICBC_FLOAT // Purely scalar version. + +#define VEC_SIZE 1 + +using VFloat = float; +using VMask = bool; + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; } +ICBC_FORCEINLINE VFloat vzero() { return 0.0f; } +ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; } +ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; } +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; } +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; } +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; } +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; } +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); } +ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); } +ICBC_FORCEINLINE VFloat lane_id() { return 0; } +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; } +ICBC_FORCEINLINE bool all(VMask m) { return m; } +ICBC_FORCEINLINE bool any(VMask m) { return m; } +ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; } +ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; } +ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {} + +#elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41 + +#define VEC_SIZE 4 + +#if __GNUC__ +union VFloat { + __m128 v; + float m128_f32[VEC_SIZE]; + + VFloat() {} + VFloat(__m128 v) : v(v) {} + operator __m128 & () { return v; } +}; +union VMask { + __m128 m; + + VMask() {} + VMask(__m128 m) : m(m) {} + operator __m128 & () { return m; } +}; +#else +using VFloat = __m128; +using VMask = __m128; +#endif + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { + return v.m128_f32[i]; +} + +ICBC_FORCEINLINE VFloat vzero() { + return _mm_setzero_ps(); +} + +ICBC_FORCEINLINE VFloat vbroadcast(float x) { + return _mm_set1_ps(x); +} + +ICBC_FORCEINLINE VFloat vload(const float * ptr) { + return _mm_load_ps(ptr); +} + +ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { + VFloat v; + for (int i = 0; i < VEC_SIZE; i++) { + lane(v, i) = base[int(lane(index, i))]; + } + return v; +} + +ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { + return _mm_add_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { + return _mm_sub_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { + return _mm_mul_ps(a, b); +} + +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { +#if ICBC_USE_RCP + VFloat r = _mm_rcp_ps(a); + return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a))); // r * (2 - r * a) +#else + return _mm_div_ps(vbroadcast(1.0f), a); +#endif +} + +// a*b+c +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { + return a * b + c; +} + +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { + return a * b - c; +} + +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { + return a * b - c * d; +} + +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { + auto zero = _mm_setzero_ps(); + auto one = _mm_set1_ps(1.0f); + return _mm_min_ps(_mm_max_ps(a, zero), one); +} + +ICBC_FORCEINLINE VFloat vround01(VFloat a) { +#if ICBC_SIMD == ICBC_SSE41 + return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +#else + // @@ Assumes a is positive and small. + return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f))); +#endif +} + +ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { +#if ICBC_SIMD == ICBC_SSE41 + return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return _mm_cvtepi32_ps(_mm_cvttps_epi32(a)); +#endif +} + +ICBC_FORCEINLINE VFloat lane_id() { + return _mm_set_ps(3, 2, 1, 0); +} + +ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); } +ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); } +ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); } +ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); } + +ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); } +ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); } +ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); } + +// mask ? b : a +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { +#if ICBC_SIMD == ICBC_SSE41 + return _mm_blendv_ps(a, b, mask); +#else + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b)); +#endif +} + +ICBC_FORCEINLINE bool all(VMask m) { + int value = _mm_movemask_ps(m); + return value == 0x7; +} + +ICBC_FORCEINLINE bool any(VMask m) { + int value = _mm_movemask_ps(m); + return value != 0; +} + +ICBC_FORCEINLINE uint mask(VMask m) { + return (uint)_mm_movemask_ps(m); +} + +ICBC_FORCEINLINE int reduce_min_index(VFloat v) { + + // First do an horizontal reduction. // v = [ D C | B A ] + VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] + VFloat mins = _mm_min_ps(v, shuf); // mins = [ D+C C+D | B+A A+B ] + shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] // let the compiler avoid a mov by reusing shuf + mins = _mm_min_ss(mins, shuf); + mins = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0)); + + // Then find the index. + uint mask = _mm_movemask_ps(v <= mins); + return ctz(mask); +} + +// https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html +ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) { + VFloat t0 = _mm_unpacklo_ps(r0, r1); + VFloat t1 = _mm_unpacklo_ps(r2, r3); + VFloat t2 = _mm_unpackhi_ps(r0, r1); + VFloat t3 = _mm_unpackhi_ps(r2, r3); + r0 = _mm_movelh_ps(t0, t1); + r1 = _mm_movehl_ps(t1, t0); + r2 = _mm_movelh_ps(t2, t3); + r3 = _mm_movehl_ps(t3, t2); +} + + +#elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2 + +#define VEC_SIZE 8 + +#if __GNUC__ +union VFloat { + __m256 v; + float m256_f32[VEC_SIZE]; + + VFloat() {} + VFloat(__m256 v) : v(v) {} + operator __m256 & () { return v; } +}; +union VInt { + __m256i v; + int m256_i32[VEC_SIZE]; + + VInt() {} + VInt(__m256i v) : v(v) {} + operator __m256i & () { return v; } +}; +union VMask { + __m256 m; + + VMask() {} + VMask(__m256 m) : m(m) {} + operator __m256 & () { return m; } +}; +#else +using VFloat = __m256; +using VInt = __m256i; +using VMask = __m256; // Emulate mask vector using packed float. +#endif + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { + return v.m256_f32[i]; +} + +ICBC_FORCEINLINE VFloat vzero() { + return _mm256_setzero_ps(); +} + +ICBC_FORCEINLINE VFloat vbroadcast(float a) { + return _mm256_set1_ps(a); +} + +ICBC_FORCEINLINE VFloat vload(const float * ptr) { + return _mm256_load_ps(ptr); +} + +ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { +#if ICBC_SIMD == ICBC_AVX2 + return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4); +#else + VFloat v; + for (int i = 0; i < VEC_SIZE; i++) { + lane(v, i) = base[int(lane(index, i))]; + } + return v; +#endif +} + +ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { + return _mm256_add_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { + return _mm256_sub_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { + return _mm256_mul_ps(a, b); +} + +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { +#if ICBC_USE_RCP + #if ICBC_SIMD == ICBC_AVX512 + VFloat r = _mm256_rcp14_ps(a); + #else + VFloat r = _mm256_rcp_ps(a); + #endif + + // r = r * (2 - r * a) + #if ICBC_USE_FMA == 3 || ICBC_AVX2 + return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f))); + #else + return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a))); + #endif +#else + return _mm256_div_ps(vbroadcast(1.0f), a); +#endif +} + +// a*b+c +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { +#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 + return _mm256_fmadd_ps(a, b, c); +#elif ICBC_USE_FMA == 4 + return _mm256_macc_ps(a, b, c); +#else + return ((a * b) + c); +#endif +} + +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { +#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 + return _mm256_fmsub_ps(a, b, c); +#elif ICBC_USE_FMA == 4 + return _mm256_msub_ps(a, b, c); +#else + return ((a * b) - c); +#endif +} + +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { + return vmsub(a, b, c * d); +} + +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { + __m256 zero = _mm256_setzero_ps(); + __m256 one = _mm256_set1_ps(1.0f); + return _mm256_min_ps(_mm256_max_ps(a, zero), one); +} + +ICBC_FORCEINLINE VFloat vround01(VFloat a) { + return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { + return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +} + +ICBC_FORCEINLINE VFloat lane_id() { + return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0); +} + +ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); } +ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); } +ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); } +ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); } + +ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); } +ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); } +ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); } + +// mask ? b : a +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { + return _mm256_blendv_ps(a, b, mask); +} + +ICBC_FORCEINLINE bool all(VMask m) { + __m256 zero = _mm256_setzero_ps(); + return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0; +} + +ICBC_FORCEINLINE bool any(VMask m) { + return _mm256_testz_ps(m, m) == 0; +} + +ICBC_FORCEINLINE uint mask(VMask m) { + return (uint)_mm256_movemask_ps(m); +} + +ICBC_FORCEINLINE int reduce_min_index(VFloat v) { + + __m128 vlow = _mm256_castps256_ps128(v); + __m128 vhigh = _mm256_extractf128_ps(v, 1); + vlow = _mm_min_ps(vlow, vhigh); + + // First do an horizontal reduction. // v = [ D C | B A ] + __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] + __m128 mins = _mm_min_ps(vlow, shuf); // mins = [ D+C C+D | B+A A+B ] + shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] + mins = _mm_min_ss(mins, shuf); + + VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0); // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0) + + // Then find the index. + uint mask = _mm256_movemask_ps(v <= vmin); + return ctz(mask); +} + +// AoS to SoA +ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { + VFloat r0 = _mm256_unpacklo_ps(a, b); + VFloat r1 = _mm256_unpacklo_ps(c, d); + VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20); + VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31); + r0 = _mm256_unpackhi_ps(a, b); + r1 = _mm256_unpackhi_ps(c, d); + a = _mm256_unpacklo_ps(r2, r3); + b = _mm256_unpackhi_ps(r2, r3); + r2 = _mm256_permute2f128_ps(r0, r1, 0x20); + r3 = _mm256_permute2f128_ps(r0, r1, 0x31); + c = _mm256_unpacklo_ps(r2, r3); + d = _mm256_unpackhi_ps(r2, r3); +} + +ICBC_FORCEINLINE int lane(VInt v, int i) { + //return _mm256_extract_epi32(v, i); + return v.m256_i32[i]; +} + +ICBC_FORCEINLINE VInt vzeroi() { + return _mm256_setzero_si256(); +} + +ICBC_FORCEINLINE VInt vbroadcast(int a) { + return _mm256_set1_epi32(a); +} + +ICBC_FORCEINLINE VInt vload(const int * ptr) { + return _mm256_load_si256((const __m256i*)ptr); +} + +ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); } +ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); } +ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); } + +ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); } +ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); } + +// mask ? v[idx] : 0 +ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) { + return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask); +} + +// mask ? (idx > 8 ? vhi[idx] : vlo[idx]) : 0 +ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) { +#if 0 + VMask mhi = idx > 7; + vlo = _mm256_permutevar8x32_ps(vlo, idx); + vhi = _mm256_permutevar8x32_ps(vhi, idx); + VFloat v = _mm256_blendv_ps(vlo, vhi, mhi); + return _mm256_and_ps(v, mask); +#else + // Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following: + vhi = _mm256_xor_ps(vhi, vlo); + VFloat v = _mm256_permutevar8x32_ps(vlo, idx); + VMask mhi = idx > 7; + v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi)); + return _mm256_and_ps(v, mask); +#endif +} + +ICBC_FORCEINLINE VFloat vgatherifpositive(const float * base, VInt vidx) { + VFloat v = vzero(); + for (int i = 0; i < VEC_SIZE; i++) { + int idx = lane(vidx, i); + if (idx >= 0) lane(v, i) = base[idx]; + } + return v; +} + +ICBC_FORCEINLINE VFloat vmask_gather(VMask mask, const float * base, VInt vidx) { +#if 1//ICBC_SIMD == ICBC_AVX2 + VFloat v = vzero(); + return _mm256_mask_i32gather_ps(v, base, vidx, mask, 4); +#else + VFloat v = vzero(); + for (int i = 0; i < VEC_SIZE; i++) { + int idx = lane(vidx, i); + if (idx >= 0) lane(v, i) = base[idx]; + } + return v; +#endif +} + + +#elif ICBC_SIMD == ICBC_AVX512 + +#define VEC_SIZE 16 + +#if __GNUC__ +union VFloat { + __m512 v; + float m512_f32[VEC_SIZE]; + + VFloat() {} + VFloat(__m512 v) : v(v) {} + operator __m512 & () { return v; } +}; +union VInt { + __m512i v; + int m512_i32[VEC_SIZE]; + + VInt() {} + VInt(__m512i v) : v(v) {} + operator __m512i & () { return v; } +}; +#else +using VFloat = __m512; +using VInt = __m512i; +#endif +struct VMask { __mmask16 m; }; + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { + return v.m512_f32[i]; +} + +ICBC_FORCEINLINE VFloat vzero() { + return _mm512_setzero_ps(); +} + +ICBC_FORCEINLINE VFloat vbroadcast(float a) { + return _mm512_set1_ps(a); +} + +ICBC_FORCEINLINE VFloat vload(const float * ptr) { + return _mm512_load_ps(ptr); +} + +ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) { + return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr); +} + +ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) { + return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr); +} + +ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { + return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4); +} + +ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { + return _mm512_add_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { + return _mm512_sub_ps(a, b); +} + +ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { + return _mm512_mul_ps(a, b); +} + +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { +#if ICBC_USE_RCP + VFloat r = _mm512_rcp14_ps(a); + + // r = r * (2 - r * a) + return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f))); +#else + return _mm512_div_ps(vbroadcast(1.0f), a); +#endif +} + +// a*b+c +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { + return _mm512_fmadd_ps(a, b, c); +} + +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { + return _mm512_fmsub_ps(a, b, c); +} + +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { + return vmsub(a, b, c * d); +} + +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { + auto zero = _mm512_setzero_ps(); + auto one = _mm512_set1_ps(1.0f); + return _mm512_min_ps(_mm512_max_ps(a, zero), one); +} + +ICBC_FORCEINLINE VFloat vround01(VFloat a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT); +} + +ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { + return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO); +} + +ICBC_FORCEINLINE VFloat lane_id() { + return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +} + +ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; } +ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; } +ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) }; } +ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; } + +ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; } +ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; } +ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; } +ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; } + +// mask ? b : a +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { + return _mm512_mask_blend_ps(mask.m, a, b); +} + +ICBC_FORCEINLINE bool all(VMask mask) { + return mask.m == 0xFFFFFFFF; +} + +ICBC_FORCEINLINE bool any(VMask mask) { + return mask.m != 0; +} + +ICBC_FORCEINLINE uint mask(VMask mask) { + return mask.m; +} + +ICBC_FORCEINLINE int reduce_min_index(VFloat v) { + + // First do an horizontal reduction. + VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v)); + + // Then find the index. + VMask mask = (v <= vmin); + return ctz(mask.m); +} + +//ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d); // @@ + +ICBC_FORCEINLINE int lane(VInt v, int i) { + //return _mm256_extract_epi32(v, i); + return v.m512_i32[i]; +} + +ICBC_FORCEINLINE VInt vzeroi() { + return _mm512_setzero_epi32(); +} + +ICBC_FORCEINLINE VInt vbroadcast(int a) { + return _mm512_set1_epi32(a); +} + +ICBC_FORCEINLINE VInt vload(const int * ptr) { + return _mm512_load_epi32(ptr); +} + +ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); } +ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_si256(A, vbroadcast(b)); } +ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); } + +ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm512_cmpgt_epi32_mask(A, vbroadcast(b)); } +ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm512_cmpeq_epi32_mask(A, vbroadcast(b)); } + +// mask ? v[idx] : 0 +ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) { + return _mm256_maskz_permutexvar_ps(mask, idx, v); +} + + +#elif ICBC_SIMD == ICBC_NEON + +#define VEC_SIZE 4 + +#if __GNUC__ +union VFloat { + float32x4_t v; + float e[4]; + VFloat() {} + VFloat(float32x4_t v) : v(v) {} + operator float32x4_t & () { return v; } +}; +struct VMask { + uint32x4_t v; + VMask() {} + VMask(uint32x4_t v) : v(v) {} + operator uint32x4_t & () { return v; } +}; +#else +using VFloat = float32x4_t; +using VMask = uint32x4_t; +#endif + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { +#if defined(__clang__) + return v.e[i]; +#elif defined(_MSC_VER) + return v.n128_f32[i]; +#else + return v.v[i]; +#endif +} + +ICBC_FORCEINLINE VFloat vzero() { + return vdupq_n_f32(0.0f); +} + +ICBC_FORCEINLINE VFloat vbroadcast(float a) { + return vdupq_n_f32(a); +} + +ICBC_FORCEINLINE VFloat vload(const float * ptr) { + return vld1q_f32(ptr); +} + +ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { + return vaddq_f32(a, b); +} + +ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { + return vsubq_f32(a, b); +} + +ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { + return vmulq_f32(a, b); +} + +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { +//#if ICBC_USE_RCP + VFloat rcp = vrecpeq_f32(a); + //return rcp; + return vmulq_f32(vrecpsq_f32(a, rcp), rcp); +//#else +// return vdiv_f32(vbroadcast(1.0f), a); // @@ ARMv8 only? +//#endif +} + +// a*b+c +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { + return vmlaq_f32(c, a, b); +} + +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { + return a * b - c; +} + +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { + // vmlsq_f32(a, b, c) == a - (b * c) + return vmlsq_f32(a * b, c, d); +} + +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { + return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1)); +} + +ICBC_FORCEINLINE VFloat vround01(VFloat a) { +#if __ARM_RACH >= 8 + return vrndqn_f32(a); // Round to integral (to nearest, ties to even) +#else + // @@ Assumes a is positive and ~small + return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5))); +#endif +} + +ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { +#if __ARM_RACH >= 8 + return vrndq_f32(a); +#else + return vcvtq_f32_s32(vcvtq_s32_f32(a)); +#endif +} + +ICBC_FORCEINLINE VFloat lane_id() { + ICBC_ALIGN_16 float data[4] = { 0, 1, 2, 3 }; + return vld1q_f32(data); +} + +ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; } +ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; } +ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; } +ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; } + +ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; } +ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; } +ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; } + +// mask ? b : a +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { + return vbslq_f32(mask, b, a); +} + +ICBC_FORCEINLINE bool all(VMask mask) { + uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask)); + uint32x2_t m1 = vpmin_u32(m2, m2); +#if defined(_MSC_VER) + return m1.n64_u32[0] != 0; +#else + return m1[0] != 0; +#endif +} + +ICBC_FORCEINLINE bool any(VMask mask) { + uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask)); + uint32x2_t m1 = vpmax_u32(m2, m2); +#if defined(_MSC_VER) + return m1.n64_u32[0] != 0; +#else + return m1[0] != 0; +#endif +} + +// @@ Is this the best we can do? +// From: https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h +ICBC_FORCEINLINE uint mask(VMask mask) { + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t1 = vtstq_u32(mask, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +} + +inline int reduce_min_index(VFloat v) { +#if 0 + float32x2_t m2 = vpmin_f32(vget_low_u32(V), vget_high_u32(v)); + float32x2_t m1 = vpmin_f32(m2, m2); + float min_value = vget_lane_f32(m1, 0); + VFloat vmin = vbroadcast(min_value); + + // (v <= vmin) + + // @@ Find the lane that contains minValue? +#endif + + // @@ Is there a better way to do this reduction? + int min_idx = 0; + float min_value = lane(v, 0); + + for (int i = 1; i < VEC_SIZE; i++) { + float value = lane(v, i); + if (value < min_value) { + min_value = value; + min_idx = i; + } + } + + return min_idx; +} + +// https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h +ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { + // row0 = ( x00 x01 x02 x03 ) + // row1 = ( x10 x11 x12 x13 ) + // row2 = ( x20 x21 x22 x23 ) + // row3 = ( x30 x31 x32 x33 ) + + // row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 ) + // row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 ) + float32x4x2_t row01 = vtrnq_f32(a, b); + float32x4x2_t row23 = vtrnq_f32(c, d); + + // row0 = ( x00 x10 x20 x30 ) + // row1 = ( x01 x11 x21 x31 ) + // row2 = ( x02 x12 x22 x32 ) + // row3 = ( x03 x13 x23 x33 ) + a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); + b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1])); + c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); + d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); +} + +#elif ICBC_SIMD == ICBC_VMX + +#define VEC_SIZE 4 + +union VFloat { + vectro float v; + float e[4]; + VFloat() {} + VFloat(vector float v) : v(v) {} + operator vector float & () { return v; } +}; +struct VMask { + vector unsigned int v; + VMask() {} + VMask(vector unsigned int v) : v(v) {} + operator vector unsigned int & () { return v; } +}; + +ICBC_FORCEINLINE float & lane(VFloat & v, int i) { + return v.e[i]; +} + +ICBC_FORCEINLINE VFloat vzero() { + return vec_splats(0.0f); +} + +ICBC_FORCEINLINE VFloat vbroadcast(float a) { + return vec_splats(a); +} + +ICBC_FORCEINLINE VFloat vload(const float * ptr) { + return vec_ld(ptr) +} + +ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { + return vec_add(a, b); +} + +ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { + return vec_sub(a, b); +} + +ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { + return vec_madd(a, b, vec_splats(-0.0f)); +} + +ICBC_FORCEINLINE VFloat vrcp(VFloat a) { +#if ICBC_USE_RCP + // get the reciprocal estimate + vector float estimate = vec_re( v.vec ); + + // one round of Newton-Rhaphson refinement + vector float diff = vec_nmsub( estimate, v.vec, vec_splats( 1.0f ) ); + return vec_madd(diff, estimate, estimate ); +#else + return vec_div(vec_splats(1),a); +#endif +} + +// a*b+c +ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { + return vec_madd(a, b, c); +} + +ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { + return vec_msub(a, b, c); // @@ Is this right? +} + +ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { + return vmsub(a, b, c * d); +} + +ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { + return vec_min(vec_max(a, vzero()), vbroadcast(1)); +} + +ICBC_FORCEINLINE VFloat vround01(VFloat a) { + // @@ Assumes a is positive and ~small + return vec_trunc(a + vbroadcast(0.5)); +} + +ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { + return vec_trunc(a); +} + +ICBC_FORCEINLINE VFloat lane_id() { + return (VFloat){ 0, 1, 2, 3 }; +} + +ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; } +ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; } +ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; } +ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; } + +ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; } +ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; } +ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; } + +// mask ? b : a +ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { + return vec_sel(a, b, mask); +} + +ICBC_FORCEINLINE int reduce_min_index(VFloat v) { + + //VFloat vmin = //@@ Horizontal min? + //return vec_cmpeq_idx(v, vmin); + + // @@ Is there a better way to do this reduction? + int min_idx = 0; + float min_value = lane(v, 0); + + for (int i = 1; i < VEC_SIZE; i++) { + float value = lane(v, i); + if (value < min_value) { + min_value = value; + min_idx = i; + } + } + + return min_idx; +} + +ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { + VFloat t1 = vec_mergeh(a, c); + VFloat t2 = vec_mergel(a, c); + VFloat t3 = vec_mergeh(b, d); + VFloat t4 = vec_mergel(b, d); + a = vec_mergeh(t1, t3); + b = vec_mergel(t1, t3); + c = vec_mergeh(t2, t4); + d = vec_mergel(t2, t4); +} + +#endif // ICBC_SIMD == * + +#if ICBC_SIMD != ICBC_FLOAT +ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) { + VFloat vb = vbroadcast(b); + return vmadd(a, vb, c); +} +#endif + +struct VVector3 { + VFloat x; + VFloat y; + VFloat z; +}; + +ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) { + VVector3 v8; + + v8.x = vbroadcast(v.x); + v8.y = vbroadcast(v.y); + v8.z = vbroadcast(v.z); + return v8; +} + +ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) { + VVector3 v8; + v8.x = vbroadcast(x); + v8.y = vbroadcast(y); + v8.z = vbroadcast(z); + return v8; +} + +/*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) { + // @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html +}*/ + +ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) { +#if ICBC_SIMD == ICBC_AVX512 + + // @@ AVX512 transpose not implemented. + __m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0); + + VVector3 v; + v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4); + v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4); + v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4); + return v; + +#else + + VVector3 v; + v.x = vload(&ptr->x + 0 * VEC_SIZE); + v.y = vload(&ptr->x + 1 * VEC_SIZE); + v.z = vload(&ptr->x + 2 * VEC_SIZE); + VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE); + + vtranspose4(v.x, v.y, v.z, tmp); + + return v; +#endif +} + + +ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) { + VVector3 v; + v.x = (a.x + b.x); + v.y = (a.y + b.y); + v.z = (a.z + b.z); + return v; +} + +ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) { + VVector3 v8; + v8.x = (a.x - b.x); + v8.y = (a.y - b.y); + v8.z = (a.z - b.z); + return v8; +} + +ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) { + VVector3 v8; + v8.x = (a.x * b.x); + v8.y = (a.y * b.y); + v8.z = (a.z * b.z); + return v8; +} + +ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) { + VVector3 v8; + v8.x = (a.x * b); + v8.y = (a.y * b); + v8.z = (a.z * b); + return v8; +} + +ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) { + VVector3 v8; + v8.x = vmadd(a.x, b.x, c.x); + v8.y = vmadd(a.y, b.y, c.y); + v8.z = vmadd(a.z, b.z, c.z); + return v8; +} + +ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) { + VVector3 v8; + v8.x = vmadd(a.x, b, c.x); + v8.y = vmadd(a.y, b, c.y); + v8.z = vmadd(a.z, b, c.z); + return v8; +} + +#if ICBC_SIMD != ICBC_FLOAT +ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) { + VVector3 v8; + VFloat vb = vbroadcast(b); + v8.x = vmadd(a.x, vb, c.x); + v8.y = vmadd(a.y, vb, c.y); + v8.z = vmadd(a.z, vb, c.z); + return v8; +} +#endif + +ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) { + VVector3 v8; + v8.x = vmsub(a.x, b, c); + v8.y = vmsub(a.y, b, c); + v8.z = vmsub(a.z, b, c); + return v8; +} + +ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) { + VVector3 v8; + v8.x = vmsub(a.x, b, c.x); + v8.y = vmsub(a.y, b, c.y); + v8.z = vmsub(a.z, b, c.z); + return v8; +} + +ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) { + VVector3 v; + v.x = vm2sub(a.x, b, c.x, d); + v.y = vm2sub(a.y, b, c.y, d); + v.z = vm2sub(a.z, b, c.z, d); + return v; +} + +ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) { + VVector3 v; + v.x = vm2sub(a.x, b.x, c.x, d); + v.y = vm2sub(a.y, b.y, c.y, d); + v.z = vm2sub(a.z, b.z, c.z, d); + return v; +} + +ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) { + VVector3 v; + v.x = vm2sub(a.x, b.x, c.x, d.x); + v.y = vm2sub(a.y, b.y, c.y, d.y); + v.z = vm2sub(a.z, b.z, c.z, d.z); + return v; +} + +ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) { + VFloat r; + r = a.x * b.x + a.y * b.y + a.z * b.z; + return r; +} + +// Length squared. +ICBC_FORCEINLINE VFloat vlen2(VVector3 v) { + return vdot(v, v); +} + + +// mask ? b : a +ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) { + VVector3 r; + r.x = vselect(mask, a.x, b.x); + r.y = vselect(mask, a.y, b.y); + r.z = vselect(mask, a.z, b.z); + return r; +} + + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Color conversion functions. + +static const float midpoints5[32] = { + 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, + 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX +}; + +static const float midpoints6[64] = { + 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, + 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, + 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, + 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX +}; + +/*void init_tables() { + for (int i = 0; i < 31; i++) { + float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; + float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; + midpoints5[i] = (f0 + f1) * 0.5; + } + midpoints5[31] = FLT_MAX; + + for (int i = 0; i < 63; i++) { + float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; + float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; + midpoints6[i] = (f0 + f1) * 0.5; + } + midpoints6[63] = FLT_MAX; +}*/ + +ICBC_FORCEINLINE VFloat vround5(VFloat x) { + const VFloat rb_scale = vbroadcast(31.0f); + const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f); + +#if ICBC_PERFECT_ROUND + VFloat q = vtruncate(x * rb_scale); + VFloat mp = vgather(midpoints5, q); + //return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale; + return (q + vselect(x > mp, vzero(), vbroadcast(1))) * rb_inv_scale; +#else + return vround01(x * rb_scale) * rb_inv_scale; +#endif +} + +ICBC_FORCEINLINE VFloat vround6(VFloat x) { + const VFloat g_scale = vbroadcast(63.0f); + const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f); + +#if ICBC_PERFECT_ROUND + VFloat q = vtruncate(x * g_scale); + VFloat mp = vgather(midpoints6, q); + //return (q + (vbroadcast(1) & (x > mp))) * g_inv_scale; + return (q + vselect(x > mp, vzero(), vbroadcast(1))) * g_inv_scale; +#else + return vround01(x * g_scale) * g_inv_scale; +#endif +} + +ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) { + VVector3 r; + r.x = vround5(vsaturate(v.x)); + r.y = vround6(vsaturate(v.y)); + r.z = vround5(vsaturate(v.z)); + return r; +} + +static Color16 vector3_to_color16(const Vector3 & v) { + + // Truncate. + uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f)); + uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f)); + uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f)); + + // Round exactly according to 565 bit-expansion. + r += (v.x > midpoints5[r]); + g += (v.y > midpoints6[g]); + b += (v.z > midpoints5[b]); + + Color16 c; + c.u = (r << 11) | (g << 5) | b; + return c; +} + +static Color32 bitexpand_color16_to_color32(Color16 c16) { + Color32 c32; + //c32.b = (c16.b << 3) | (c16.b >> 2); + //c32.g = (c16.g << 2) | (c16.g >> 4); + //c32.r = (c16.r << 3) | (c16.r >> 2); + //c32.a = 0xFF; + + c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); + c32.u |= (c32.u >> 5) & 0x070007; + c32.u |= (c32.u >> 6) & 0x000300; + + return c32; +} + +inline Vector3 color_to_vector3(Color32 c) { + return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f }; +} + +inline Color32 vector3_to_color32(Vector3 v) { + Color32 color; + color.r = uint8(saturate(v.x) * 255 + 0.5f); + color.g = uint8(saturate(v.y) * 255 + 0.5f); + color.b = uint8(saturate(v.z) * 255 + 0.5f); + color.a = 255; + return color; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Input block processing. + +inline bool is_black(Vector3 c) { + // This large threshold seems to improve compression. This is not forcing these texels to be black, just + // causes them to be ignored during PCA. + // @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting. + //return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0]; + //return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32; + return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8; +} + +// Find similar colors and combine them together. +static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black) +{ +#if 0 + for (int i = 0; i < 16; i++) { + colors[i] = input_colors[i].xyz; + weights[i] = input_weights[i]; + } + return 16; +#else + *any_black = false; + + int n = 0; + for (int i = 0; i < count; i++) + { + Vector3 ci = input_colors[i].xyz; + float wi = input_weights[i]; + + if (wi > 0) { + // Find matching color. + int j; + for (j = 0; j < n; j++) { + if (equal(colors[j], ci, threshold)) { + colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi); + weights[j] += wi; + break; + } + } + + // No match found. Add new color. + if (j == n) { + colors[n] = ci; + weights[n] = wi; + n++; + } + + if (is_black(ci)) { + *any_black = true; + } + } + } + + ICBC_ASSERT(n <= count); + + return n; +#endif +} + +static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights) +{ + int n = 0; + for (int i = 0; i < count; i++) + { + Vector3 ci = input_colors[i]; + float wi = input_weights[i]; + + if (is_black(ci)) { + continue; + } + + colors[n] = ci; + weights[n] = wi; + n += 1; + } + + return n; +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// PCA + +static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights) +{ + Vector3 centroid = { 0 }; + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i] * points[i]; + } + centroid *= (1.0f / total); + + return centroid; +} + +static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points, weights); + + // compute covariance matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 a = (points[i] - centroid); // @@ I think weight should be squared, but that seems to increase the error slightly. + Vector3 b = weights[i] * a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.y * b.y; + covariance[4] += a.y * b.z; + covariance[5] += a.z * b.z; + } + + return centroid; +} + +// @@ We should be able to do something cheaper... +static Vector3 estimatePrincipalComponent(const float * __restrict matrix) +{ + const Vector3 row0 = { matrix[0], matrix[1], matrix[2] }; + const Vector3 row1 = { matrix[1], matrix[3], matrix[4] }; + const Vector3 row2 = { matrix[2], matrix[4], matrix[5] }; + + float r0 = lengthSquared(row0); + float r1 = lengthSquared(row1); + float r2 = lengthSquared(row2); + + if (r0 > r1 && r0 > r2) return row0; + if (r1 > r2) return row1; + return row2; +} + +static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return {0}; + } + + Vector3 v = estimatePrincipalComponent(matrix); + + const int NUM = 8; + for (int i = 0; i < NUM; i++) + { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + + float norm = max(max(x, y), z); + + v = { x, y, z }; + v *= (1.0f / norm); + } + + return v; +} + +static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights) +{ + float matrix[6]; + computeCovariance(n, points, weights, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// SAT + +struct SummedAreaTable { + ICBC_ALIGN_16 float r[16]; + ICBC_ALIGN_16 float g[16]; + ICBC_ALIGN_16 float b[16]; + ICBC_ALIGN_16 float w[16]; +}; + +int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat) +{ + // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. + Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights); + + // build the list of values + int order[16]; + float dps[16]; + for (int i = 0; i < count; ++i) + { + order[i] = i; + dps[i] = dot(colors[i], principal); + } + + // stable sort + for (int i = 0; i < count; ++i) + { + for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j) + { + swap(dps[j], dps[j - 1]); + swap(order[j], order[j - 1]); + } + } + + float w = weights[order[0]]; + sat->r[0] = colors[order[0]].x * w; + sat->g[0] = colors[order[0]].y * w; + sat->b[0] = colors[order[0]].z * w; + sat->w[0] = w; + + for (int i = 1; i < count; i++) { + float w = weights[order[i]]; + sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w; + sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w; + sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w; + sat->w[i] = sat->w[i - 1] + w; + } + + // Try incremental decimation: + /*if (count > 4) + { + float threshold = 1.0f / 4; + + for (uint i = 1; i < count; ++i) + { + if (sat->r[i] - sat->r[i - 1] < threshold && + sat->g[i] - sat->g[i - 1] < threshold && + sat->b[i] - sat->b[i - 1] < threshold) + { + for (int j = i+1; j < count; j++) { + sat->r[j - 1] = sat->r[j]; + sat->g[j - 1] = sat->g[j]; + sat->b[j - 1] = sat->b[j]; + sat->w[j - 1] = sat->w[j]; + } + count -= 1; + i -= 1; + if (count == 4) break; + } + } + }*/ + + for (int i = count; i < 16; i++) { + sat->r[i] = FLT_MAX; + sat->g[i] = FLT_MAX; + sat->b[i] = FLT_MAX; + sat->w[i] = FLT_MAX; + } + + return count; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Cluster Fit + +struct Combinations { + uint8 c0, c1, c2, pad; +}; + +static ICBC_ALIGN_16 int s_fourClusterTotal[16]; +static ICBC_ALIGN_16 int s_threeClusterTotal[16]; +static ICBC_ALIGN_16 Combinations s_fourCluster[968 + 8]; +static ICBC_ALIGN_16 Combinations s_threeCluster[152 + 8]; + +#if ICBC_USE_NEON_VTL +static uint8 s_neon_vtl_index0_4[4 * 968]; +static uint8 s_neon_vtl_index1_4[4 * 968]; +static uint8 s_neon_vtl_index2_4[4 * 968]; + +static uint8 s_neon_vtl_index0_3[4 * 152]; +static uint8 s_neon_vtl_index1_3[4 * 152]; +#endif + +static void init_cluster_tables() { + + for (int t = 1, i = 0; t <= 16; t++) { + for (int c0 = 0; c0 <= t; c0++) { + for (int c1 = 0; c1 <= t - c0; c1++) { + for (int c2 = 0; c2 <= t - c0 - c1; c2++) { + + // Skip this cluster so that the total is a multiple of 8 + if (c0 == 0 && c1 == 0 && c2 == 0) continue; + + bool found = false; + if (t > 1) { + for (int j = 0; j < s_fourClusterTotal[t-2]; j++) { + if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) { + found = true; + break; + } + } + } + + if (!found) { + s_fourCluster[i].c0 = c0; + s_fourCluster[i].c1 = c0+c1; + s_fourCluster[i].c2 = c0+c1+c2; + i++; + } + } + } + } + + s_fourClusterTotal[t - 1] = i; + } + + // Replicate last entry. + for (int i = 0; i < 8; i++) { + s_fourCluster[968 + i] = s_fourCluster[968-1]; + } + + for (int t = 1, i = 0; t <= 16; t++) { + for (int c0 = 0; c0 <= t; c0++) { + for (int c1 = 0; c1 <= t - c0; c1++) { + + // Skip this cluster so that the total is a multiple of 8 + if (c0 == 0 && c1 == 0) continue; + + bool found = false; + if (t > 1) { + for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) { + if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) { + found = true; + break; + } + } + } + + if (!found) { + s_threeCluster[i].c0 = c0; + s_threeCluster[i].c1 = c0 + c1; + i++; + } + } + } + + s_threeClusterTotal[t - 1] = i; + } + + // Replicate last entry. + for (int i = 0; i < 8; i++) { + s_threeCluster[152 + i] = s_threeCluster[152 - 1]; + } + +#if ICBC_USE_NEON_VTL + for (int i = 0; i < 968; i++) { + int c0 = (s_fourCluster[i].c0) - 1; + s_neon_vtl_index0_4[4 * i + 0] = uint8(c0 * 4 + 0); + s_neon_vtl_index0_4[4 * i + 1] = uint8(c0 * 4 + 1); + s_neon_vtl_index0_4[4 * i + 2] = uint8(c0 * 4 + 2); + s_neon_vtl_index0_4[4 * i + 3] = uint8(c0 * 4 + 3); + + int c1 = (s_fourCluster[i].c1) - 1; + s_neon_vtl_index1_4[4 * i + 0] = uint8(c1 * 4 + 0); + s_neon_vtl_index1_4[4 * i + 1] = uint8(c1 * 4 + 1); + s_neon_vtl_index1_4[4 * i + 2] = uint8(c1 * 4 + 2); + s_neon_vtl_index1_4[4 * i + 3] = uint8(c1 * 4 + 3); + + int c2 = (s_fourCluster[i].c2) - 1; + s_neon_vtl_index2_4[4 * i + 0] = uint8(c2 * 4 + 0); + s_neon_vtl_index2_4[4 * i + 1] = uint8(c2 * 4 + 1); + s_neon_vtl_index2_4[4 * i + 2] = uint8(c2 * 4 + 2); + s_neon_vtl_index2_4[4 * i + 3] = uint8(c2 * 4 + 3); + } + + for (int i = 0; i < 152; i++) { + int c0 = (s_threeCluster[i].c0) - 1; + s_neon_vtl_index0_3[4 * i + 0] = uint8(c0 * 4 + 0); + s_neon_vtl_index0_3[4 * i + 1] = uint8(c0 * 4 + 1); + s_neon_vtl_index0_3[4 * i + 2] = uint8(c0 * 4 + 2); + s_neon_vtl_index0_3[4 * i + 3] = uint8(c0 * 4 + 3); + + int c1 = (s_threeCluster[i].c1) - 1; + s_neon_vtl_index1_3[4 * i + 0] = uint8(c1 * 4 + 0); + s_neon_vtl_index1_3[4 * i + 1] = uint8(c1 * 4 + 1); + s_neon_vtl_index1_3[4 * i + 2] = uint8(c1 * 4 + 2); + s_neon_vtl_index1_3[4 * i + 3] = uint8(c1 * 4 + 3); + } +#endif +} + + + +static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) +{ + const float r_sum = sat.r[count-1]; + const float g_sum = sat.g[count-1]; + const float b_sum = sat.b[count-1]; + const float w_sum = sat.w[count-1]; + + VFloat vbesterror = vbroadcast(FLT_MAX); + VVector3 vbeststart = { vzero(), vzero(), vzero() }; + VVector3 vbestend = { vzero(), vzero(), vzero() }; + + // check all possible clusters for this total order + const int total_order_count = s_threeClusterTotal[count - 1]; + + for (int i = 0; i < total_order_count; i += VEC_SIZE) + { + VVector3 x0, x1; + VFloat w0, w1; + +#if ICBC_USE_AVX512_PERMUTE + + auto loadmask = lane_id() < vbroadcast(float(count)); + + // Load sat in one register: + VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); + VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); + VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); + VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); + + // Load 4 uint8 per lane. + __m512i packedClusterIndex = _mm512_load_si512((__m512i *)&s_threeCluster[i]); + + // Load index and decrement. + auto c0 = _mm512_and_epi32(packedClusterIndex, _mm512_set1_epi32(0xFF)); + auto c0mask = _mm512_cmpgt_epi32_mask(c0, _mm512_setzero_si512()); + c0 = _mm512_sub_epi32(c0, _mm512_set1_epi32(1)); + + // @@ Avoid blend_ps? + // if upper bit set, zero, otherwise load sat entry. + x0.x = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vrsat)); + x0.y = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vgsat)); + x0.z = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vbsat)); + w0 = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vwsat)); + + auto c1 = _mm512_and_epi32(_mm512_srli_epi32(packedClusterIndex, 8), _mm512_set1_epi32(0xFF)); + auto c1mask = _mm512_cmpgt_epi32_mask(c1, _mm512_setzero_si512()); + c1 = _mm512_sub_epi32(c1, _mm512_set1_epi32(1)); + + x1.x = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vrsat)); + x1.y = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vgsat)); + x1.z = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vbsat)); + w1 = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vwsat)); + +#elif ICBC_USE_AVX2_PERMUTE2 + + // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2. + VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); + + VInt c0 = (packedClusterIndex & 0xFF); + VInt c1 = ((packedClusterIndex >> 8)); // No need for & 0xFF + + if (count <= 8) { + // Load sat.r in one register: + VFloat rLo = vload(sat.r); + VFloat gLo = vload(sat.g); + VFloat bLo = vload(sat.b); + VFloat wLo = vload(sat.w); + + x0.x = vpermuteif(c0>0, rLo, c0-1); + x0.y = vpermuteif(c0>0, gLo, c0-1); + x0.z = vpermuteif(c0>0, bLo, c0-1); + w0 = vpermuteif(c0>0, wLo, c0-1); + + x1.x = vpermuteif(c1>0, rLo, c1-1); + x1.y = vpermuteif(c1>0, gLo, c1-1); + x1.z = vpermuteif(c1>0, bLo, c1-1); + w1 = vpermuteif(c1>0, wLo, c1-1); + } + else { + // Load sat.r in two registers: + VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); + VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); + VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); + VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); + + x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); + x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); + x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); + w0 = vpermute2if(c0>0, wLo, wHi, c0-1); + + x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); + x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); + x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); + w1 = vpermute2if(c1>0, wLo, wHi, c1-1); + } + +#elif ICBC_USE_NEON_VTL + + uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_3[4*i]; + uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_3[4*i]; + + if (count <= 4) { + uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); + uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); + uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); + uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); + } + else if (count <= 8) { + uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); + uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); + uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); + uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); + } + else if (count <= 12) { + uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); + uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); + uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); + uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); + } + else { + // Load SAT. + uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); + uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); + uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); + uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); + } + +#else + // Scalar path + x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); + x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); + + for (int l = 0; l < VEC_SIZE; l++) { + int c0 = s_threeCluster[i + l].c0 - 1; + if (c0 >= 0) { + lane(x0.x, l) = sat.r[c0]; + lane(x0.y, l) = sat.g[c0]; + lane(x0.z, l) = sat.b[c0]; + lane(w0, l) = sat.w[c0]; + } + + int c1 = s_threeCluster[i + l].c1 - 1; + if (c1 >= 0) { + lane(x1.x, l) = sat.r[c1]; + lane(x1.y, l) = sat.g[c1]; + lane(x1.z, l) = sat.b[c1]; + lane(w1, l) = sat.w[c1]; + } + } +#endif + + VFloat w2 = vbroadcast(w_sum) - w1; + x1 = x1 - x0; + w1 = w1 - w0; + + VFloat alphabeta_sum = w1 * vbroadcast(0.25f); + VFloat alpha2_sum = w0 + alphabeta_sum; + VFloat beta2_sum = w2 + alphabeta_sum; + VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); + + VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f); + VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; + + VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; + VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; + + // snap to the grid + a = vround_ept(a); + b = vround_ept(b); + + // compute the error + VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); + VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); + + // apply the metric to the error term + VFloat error = vdot(e1, vbroadcast(metric_sqr)); + + + // keep the solution if it wins + auto mask = (error < vbesterror); + + // I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX. + //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? + + vbesterror = vselect(mask, vbesterror, error); + vbeststart = vselect(mask, vbeststart, a); + vbestend = vselect(mask, vbestend, b); + } + + int bestindex = reduce_min_index(vbesterror); + + start->x = lane(vbeststart.x, bestindex); + start->y = lane(vbeststart.y, bestindex); + start->z = lane(vbeststart.z, bestindex); + end->x = lane(vbestend.x, bestindex); + end->y = lane(vbestend.y, bestindex); + end->z = lane(vbestend.z, bestindex); +} + + +static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) +{ + const float r_sum = sat.r[count-1]; + const float g_sum = sat.g[count-1]; + const float b_sum = sat.b[count-1]; + const float w_sum = sat.w[count-1]; + + VFloat vbesterror = vbroadcast(FLT_MAX); + VVector3 vbeststart = { vzero(), vzero(), vzero() }; + VVector3 vbestend = { vzero(), vzero(), vzero() }; + + // check all possible clusters for this total order + const int total_order_count = s_fourClusterTotal[count - 1]; + + for (int i = 0; i < total_order_count; i += VEC_SIZE) + { + VVector3 x0, x1, x2; + VFloat w0, w1, w2; + + /* + // Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation. + uint akku = 0; + + // Compute alpha & beta for this permutation. + #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float3 betax_sum = 9.0f * color_sum - alphax_sum; + */ + +#if ICBC_USE_AVX512_PERMUTE + + auto loadmask = lane_id() < vbroadcast(float(count)); + + // Load sat in one register: + VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); + VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); + VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); + VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); + +#if 0 + // Load 4 uint8 per lane. + VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); + + VInt c0 = (packedClusterIndex & 0xFF) - 1; + VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1; + VInt c2 = ((packedClusterIndex >> 16)) - 1; // @@ No need for & + + x0.x = vpermuteif(c0 >= 0, vrsat, c0); + x0.y = vpermuteif(c0 >= 0, vgsat, c0); + x0.z = vpermuteif(c0 >= 0, vbsat, c0); + w0 = vpermuteif(c0 >= 0, vwsat, c0); + + x1.x = vpermuteif(c1 >= 0, vrsat, c1); + x1.y = vpermuteif(c1 >= 0, vgsat, c1); + x1.z = vpermuteif(c1 >= 0, vbsat, c1); + w1 = vpermuteif(c1 >= 0, vwsat, c1); + + x2.x = vpermuteif(c2 >= 0, vrsat, c2); + x2.y = vpermuteif(c2 >= 0, vgsat, c2); + x2.z = vpermuteif(c2 >= 0, vbsat, c2); + w2 = vpermuteif(c2 >= 0, vwsat, c2); +#else + + // Load 4 uint8 per lane. + __m512i packedClusterIndex = _mm512_load_si512((__m512i *)&s_fourCluster[i]); + + // Load index and decrement. + auto c0 = _mm512_and_epi32(packedClusterIndex, _mm512_set1_epi32(0xFF)); + auto c0mask = _mm512_cmpgt_epi32_mask(c0, _mm512_setzero_si512()); + c0 = _mm512_sub_epi32(c0, _mm512_set1_epi32(1)); + + // if upper bit set, zero, otherwise load sat entry. + x0.x = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vrsat)); + x0.y = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vgsat)); + x0.z = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vbsat)); + w0 = _mm512_mask_blend_ps(c0mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c0, vwsat)); + + auto c1 = _mm512_and_epi32(_mm512_srli_epi32(packedClusterIndex, 8), _mm512_set1_epi32(0xFF)); + auto c1mask = _mm512_cmpgt_epi32_mask(c1, _mm512_setzero_si512()); + c1 = _mm512_sub_epi32(c1, _mm512_set1_epi32(1)); + + x1.x = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vrsat)); + x1.y = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vgsat)); + x1.z = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vbsat)); + w1 = _mm512_mask_blend_ps(c1mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c1, vwsat)); + + auto c2 = _mm512_and_epi32(_mm512_srli_epi32(packedClusterIndex, 16), _mm512_set1_epi32(0xFF)); + auto c2mask = _mm512_cmpgt_epi32_mask(c2, _mm512_setzero_si512()); + c2 = _mm512_sub_epi32(c2, _mm512_set1_epi32(1)); + + x2.x = _mm512_mask_blend_ps(c2mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c2, vrsat)); + x2.y = _mm512_mask_blend_ps(c2mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c2, vgsat)); + x2.z = _mm512_mask_blend_ps(c2mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c2, vbsat)); + w2 = _mm512_mask_blend_ps(c2mask, _mm512_setzero_ps(), _mm512_permutexvar_ps(c2, vwsat)); +#endif + +#elif ICBC_USE_AVX2_PERMUTE2 + + // Load 4 uint8 per lane. + VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); + + VInt c0 = (packedClusterIndex & 0xFF); + VInt c1 = ((packedClusterIndex >> 8) & 0xFF); + VInt c2 = ((packedClusterIndex >> 16)); // @@ No need for & + + if (count <= 8) { + // Load sat.r in one register: + VFloat rLo = vload(sat.r); + VFloat gLo = vload(sat.g); + VFloat bLo = vload(sat.b); + VFloat wLo = vload(sat.w); + + x0.x = vpermuteif(c0>0, rLo, c0-1); + x0.y = vpermuteif(c0>0, gLo, c0-1); + x0.z = vpermuteif(c0>0, bLo, c0-1); + w0 = vpermuteif(c0>0, wLo, c0-1); + + x1.x = vpermuteif(c1>0, rLo, c1-1); + x1.y = vpermuteif(c1>0, gLo, c1-1); + x1.z = vpermuteif(c1>0, bLo, c1-1); + w1 = vpermuteif(c1>0, wLo, c1-1); + + x2.x = vpermuteif(c2>0, rLo, c2-1); + x2.y = vpermuteif(c2>0, gLo, c2-1); + x2.z = vpermuteif(c2>0, bLo, c2-1); + w2 = vpermuteif(c2>0, wLo, c2-1); + } + else { + // Load sat.r in two registers: + VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); + VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); + VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); + VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); + + x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); + x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); + x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); + w0 = vpermute2if(c0>0, wLo, wHi, c0-1); + + x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); + x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); + x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); + w1 = vpermute2if(c1>0, wLo, wHi, c1-1); + + x2.x = vpermute2if(c2>0, rLo, rHi, c2-1); + x2.y = vpermute2if(c2>0, gLo, gHi, c2-1); + x2.z = vpermute2if(c2>0, bLo, bHi, c2-1); + w2 = vpermute2if(c2>0, wLo, wHi, c2-1); + } + +#elif ICBC_USE_NEON_VTL + + uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_4[4*i]; + uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_4[4*i]; + uint8x16_t idx2 = (uint8x16_t &)s_neon_vtl_index2_4[4*i]; + + if (count <= 4) { + uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); + uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); + uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); + uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); + + x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2)); + x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2)); + x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2)); + w2 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2)); + } + else if (count <= 8) { + uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); + uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); + uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); + uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); + + x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2)); + x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2)); + x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2)); + w2 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2)); + } + else if (count <= 12) { + uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); + uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); + uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); + uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); + + x2.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2)); + x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2)); + x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2)); + w2 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2)); + } + else { + uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); + uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); + uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); + uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); + + x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); + x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); + x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); + w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); + + x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); + x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); + x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); + w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); + + x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2)); + x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2)); + x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2)); + w2 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2)); + } + +#else + + // Scalar path + x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); + x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); + x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero(); + + for (int l = 0; l < VEC_SIZE; l++) { + int c0 = s_fourCluster[i + l].c0 - 1; + if (c0 >= 0) { + lane(x0.x, l) = sat.r[c0]; + lane(x0.y, l) = sat.g[c0]; + lane(x0.z, l) = sat.b[c0]; + lane(w0, l) = sat.w[c0]; + } + + int c1 = s_fourCluster[i + l].c1 - 1; + if (c1 >= 0) { + lane(x1.x, l) = sat.r[c1]; + lane(x1.y, l) = sat.g[c1]; + lane(x1.z, l) = sat.b[c1]; + lane(w1, l) = sat.w[c1]; + } + + int c2 = s_fourCluster[i + l].c2 - 1; + if (c2 >= 0) { + lane(x2.x, l) = sat.r[c2]; + lane(x2.y, l) = sat.g[c2]; + lane(x2.z, l) = sat.b[c2]; + lane(w2, l) = sat.w[c2]; + } + } + +#endif + + VFloat w3 = vbroadcast(w_sum) - w2; + x2 = x2 - x1; + x1 = x1 - x0; + w2 = w2 - w1; + w1 = w1 - w0; + + VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0)); + VFloat beta2_sum = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3)); + + VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f); + VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); + + VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0)); + VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; + + VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; + VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; + + // snap to the grid + a = vround_ept(a); + b = vround_ept(b); + + // compute the error + VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); + VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); + + // apply the metric to the error term + VFloat error = vdot(e1, vbroadcast(metric_sqr)); + + // keep the solution if it wins + auto mask = (error < vbesterror); + + // We could mask the unused lanes here, but instead set the invalid SAT entries to FLT_MAX. + //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? + + vbesterror = vselect(mask, vbesterror, error); + vbeststart = vselect(mask, vbeststart, a); + vbestend = vselect(mask, vbestend, b); + } + + int bestindex = reduce_min_index(vbesterror); + + start->x = lane(vbeststart.x, bestindex); + start->y = lane(vbeststart.y, bestindex); + start->z = lane(vbeststart.z, bestindex); + end->x = lane(vbestend.x, bestindex); + end->y = lane(vbestend.y, bestindex); + end->z = lane(vbestend.z, bestindex); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Palette evaluation. + +// D3D10 +inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[2].r = (2 * palette[0].r + palette[1].r) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b) / 3; + palette[2].a = 0xFF; + + palette[3].r = (2 * palette[1].r + palette[0].r) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b) / 3; + palette[3].a = 0xFF; +} +inline void evaluate_palette3_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; + palette[2].a = 0xFF; + palette[3].u = 0; +} +static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + if (c0.u > c1.u) { + evaluate_palette4_d3d10(c0, c1, palette); + } + else { + evaluate_palette3_d3d10(c0, c1, palette); + } +} + +// NV +inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { + int gdiff = palette[1].g - palette[0].g; + palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; + palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256; + palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; + palette[2].a = 0xFF; + + palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; + palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256; + palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; + palette[3].a = 0xFF; +} +inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { + int gdiff = palette[1].g - palette[0].g; + palette[2].r = ((c0.r + c1.r) * 33) / 8; + palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256; + palette[2].b = ((c0.b + c1.b) * 33) / 8; + palette[2].a = 0xFF; + palette[3].u = 0; +} +static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + + if (c0.u > c1.u) { + evaluate_palette4_nv(c0, c1, palette); + } + else { + evaluate_palette3_nv(c0, c1, palette); + } +} + +// AMD +inline void evaluate_palette4_amd(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[2].r = (43 * palette[0].r + 21 * palette[1].r + 32) / 8; + palette[2].g = (43 * palette[0].g + 21 * palette[1].g + 32) / 8; + palette[2].b = (43 * palette[0].b + 21 * palette[1].b + 32) / 8; + palette[2].a = 0xFF; + + palette[3].r = (43 * palette[1].r + 21 * palette[0].r + 32) / 8; + palette[3].g = (43 * palette[1].g + 21 * palette[0].g + 32) / 8; + palette[3].b = (43 * palette[1].b + 21 * palette[0].b + 32) / 8; + palette[3].a = 0xFF; +} +inline void evaluate_palette3_amd(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[2].r = (c0.r + c1.r + 1) / 2; + palette[2].g = (c0.g + c1.g + 1) / 2; + palette[2].b = (c0.b + c1.b + 1) / 2; + palette[2].a = 0xFF; + palette[3].u = 0; +} +static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + + if (c0.u > c1.u) { + evaluate_palette4_amd(c0, c1, palette); + } + else { + evaluate_palette3_amd(c0, c1, palette); + } +} + +// Use ICBC_DECODER to determine decoder used. +inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) { +#if ICBC_DECODER == Decoder_D3D10 + evaluate_palette4_d3d10(c0, c1, palette); +#elif ICBC_DECODER == Decoder_NVIDIA + evaluate_palette4_nv(c0, c1, palette); +#elif ICBC_DECODER == Decoder_AMD + evaluate_palette4_amd(c0, c1, palette); +#endif +} +inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) { +#if ICBC_DECODER == Decoder_D3D10 + evaluate_palette3_d3d10(c0, c1, palette); +#elif ICBC_DECODER == Decoder_NVIDIA + evaluate_palette3_nv(c0, c1, palette); +#elif ICBC_DECODER == Decoder_AMD + evaluate_palette3_amd(c0, c1, palette); +#endif +} +inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { +#if ICBC_DECODER == Decoder_D3D10 + evaluate_palette_d3d10(c0, c1, palette); +#elif ICBC_DECODER == Decoder_NVIDIA + evaluate_palette_nv(c0, c1, palette); +#elif ICBC_DECODER == Decoder_AMD + evaluate_palette_amd(c0, c1, palette); +#endif +} + +static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + +static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) +{ + Color32 palette[4]; + if (decoder == Decoder_NVIDIA) { + evaluate_palette_nv(block->col0, block->col1, palette); + } + else if (decoder == Decoder_AMD) { + evaluate_palette_amd(block->col0, block->col1, palette); + } + else { + evaluate_palette(block->col0, block->col1, palette); + } + + for (int i = 0; i < 16; i++) { + int index = (block->indices >> (2 * i)) & 3; + Color32 c = palette[index]; + rgba_block[4 * i + 0] = c.r; + rgba_block[4 * i + 1] = c.g; + rgba_block[4 * i + 2] = c.b; + rgba_block[4 * i + 3] = c.a; + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Error evaluation. + +// Different ways of estimating the error. + +static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + Vector3 d = (p - c) * w * 255; + return dot(d, d); +} + +static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) { + Vector3 d = (color_to_vector3(p) - c) * w * 255; + return dot(d, d); +} + + +/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); +}*/ + +static int evaluate_mse(const Color32 & p, const Color32 & c) { + return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); +} + +/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) { + float e0 = evaluate_mse(palette[0], c, w); + float e1 = evaluate_mse(palette[1], c, w); + float e2 = evaluate_mse(palette[2], c, w); + float e3 = evaluate_mse(palette[3], c, w); + return min(min(e0, e1), min(e2, e3)); +}*/ + +static int evaluate_mse(const Color32 palette[4], const Color32 & c) { + int e0 = evaluate_mse(palette[0], c); + int e1 = evaluate_mse(palette[1], c); + int e2 = evaluate_mse(palette[2], c); + int e3 = evaluate_mse(palette[3], c); + return min(min(e0, e1), min(e2, e3)); +} + +// Returns MSE error in [0-255] range. +static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { + Color32 palette[4]; + evaluate_palette(output->col0, output->col1, palette); + + return evaluate_mse(palette[index], color); +} + +// Returns weighted MSE error in [0-255] range. +static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { + + float total = 0.0f; + for (int i = 0; i < count; i++) { + total += weights[i] * evaluate_mse(palette, colors[i]); + } + + return total; +} + +static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) { + + float total = 0.0f; + for (int i = 0; i < count; i++) { + total += evaluate_mse(palette, colors[i]); + } + + return total; +} + +static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { + Color32 palette[4]; + evaluate_palette(output->col0, output->col1, palette); + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (output->indices >> (2 * i)) & 3; + error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); + } + return error; +} + +float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) { + Color32 palette[4]; + if (decoder == Decoder_NVIDIA) { + evaluate_palette_nv(block->col0, block->col1, palette); + } + else if (decoder == Decoder_AMD) { + evaluate_palette_amd(block->col0, block->col1, palette); + } + else { + evaluate_palette(block->col0, block->col1, palette); + } + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (block->indices >> (2 * i)) & 3; + Color32 c; + c.r = rgba_block[4 * i + 0]; + c.g = rgba_block[4 * i + 1]; + c.b = rgba_block[4 * i + 2]; + c.a = 255; + error += evaluate_mse(palette[index], c); + } + return error; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Index selection + +// @@ Can we interleave the two uint16 at once? +inline uint32 interleave_uint16_with_zeros(uint32 input) { + uint32 word = input; + word = (word ^ (word << 8 )) & 0x00ff00ff; + word = (word ^ (word << 4 )) & 0x0f0f0f0f; + word = (word ^ (word << 2 )) & 0x33333333; + word = (word ^ (word << 1 )) & 0x55555555; + return word; +} + +// Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/ +ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) { +#if ICBC_BMI2 + return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa); +#else + return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1); +#endif +} + +static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { + uint indices0 = 0; + uint indices1 = 0; + + VVector3 vw = vbroadcast(color_weights); + VVector3 vp0 = vbroadcast(palette[0]) * vw; + VVector3 vp1 = vbroadcast(palette[1]) * vw; + VVector3 vp2 = vbroadcast(palette[2]) * vw; + VVector3 vp3 = vbroadcast(palette[3]) * vw; + + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]) * vw; + + VFloat d0 = vlen2(vc - vp0); + VFloat d1 = vlen2(vc - vp1); + VFloat d2 = vlen2(vc - vp2); + VFloat d3 = vlen2(vc - vp3); + + VMask b1 = d1 > d2; + VMask b2 = d0 > d2; + VMask x0 = b1 & b2; + + VMask b0 = d0 > d3; + VMask b3 = d1 > d3; + x0 = x0 | (b0 & b3); + + VMask b4 = d2 > d3; + VMask x1 = b0 & b4; + + indices0 |= mask(x0) << i; + indices1 |= mask(x1) << i; + } + + return interleave(indices1, indices0); +} + +static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { +#if 0 + Vector3 p0 = palette[0] * color_weights; + Vector3 p1 = palette[1] * color_weights; + Vector3 p2 = palette[2] * color_weights; + Vector3 p3 = palette[3] * color_weights; + + uint indices = 0; + for (int i = 0; i < 16; i++) { + Vector3 ci = input_colors[i].xyz * color_weights; + float d0 = lengthSquared(p0 - ci); + float d1 = lengthSquared(p1 - ci); + float d2 = lengthSquared(p2 - ci); + float d3 = lengthSquared(p3 - ci); + + uint index; + if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; + else if (d1 < d2 && d1 < d3) index = 1; + else if (d2 < d3) index = 2; + else index = 3; + + indices |= index << (2 * i); + } + + return indices; +#else + uint indices0 = 0; + uint indices1 = 0; + + VVector3 vw = vbroadcast(color_weights); + VVector3 vp0 = vbroadcast(palette[0]) * vw; + VVector3 vp1 = vbroadcast(palette[1]) * vw; + VVector3 vp2 = vbroadcast(palette[2]) * vw; + + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]) * vw; + + VFloat d0 = vlen2(vc - vp0); + VFloat d1 = vlen2(vc - vp1); + VFloat d2 = vlen2(vc - vp2); + VFloat d3 = vdot(vc, vc); + + // @@ simplify i1 & i2 + //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3); // 0 + VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3); // 1 + VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3); // 2 + VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); // 3 + //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0)); + + indices0 |= mask(i2 | i3) << i; + indices1 |= mask(i1 | i3) << i; + } + + uint indices = interleave(indices1, indices0); + return indices; +#endif +} + + + +static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) { +#if 0 + uint indices0 = 0; + uint indices1 = 0; + + VVector3 vp0 = vbroadcast(palette[0]); + VVector3 vp1 = vbroadcast(palette[1]); + VVector3 vp2 = vbroadcast(palette[2]); + VVector3 vp3 = vbroadcast(palette[3]); + + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]); + + VFloat d0 = vlen2(vc - vp0); + VFloat d1 = vlen2(vc - vp1); + VFloat d2 = vlen2(vc - vp2); + VFloat d3 = vlen2(vc - vp3); + + VMask b1 = d1 > d2; + VMask b2 = d0 > d2; + VMask x0 = b1 & b2; + + VMask b0 = d0 > d3; + VMask b3 = d1 > d3; + x0 = x0 | (b0 & b3); + + VMask b4 = d2 > d3; + VMask x1 = b0 & b4; + + indices0 |= mask(x0) << i; + indices1 |= mask(x1) << i; + } + + return interleave(indices1, indices0); +#else + uint indices = 0; + for (int i = 0; i < 16; i++) { + Vector3 ci = input_colors[i]; + float d0 = lengthSquared(palette[0] - ci); + float d1 = lengthSquared(palette[1] - ci); + float d2 = lengthSquared(palette[2] - ci); + float d3 = lengthSquared(palette[3] - ci); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + } + + return indices; +#endif +} + + +static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { +#if 0 + Vector3 p0 = palette[0] * color_weights; + Vector3 p1 = palette[1] * color_weights; + Vector3 p2 = palette[2] * color_weights; + Vector3 p3 = palette[3] * color_weights; + + uint indices = 0; + for (int i = 0; i < 16; i++) { + Vector3 ci = input_colors[i].xyz * color_weights; + float d0 = lengthSquared(p0 - ci); + float d1 = lengthSquared(p1 - ci); + float d2 = lengthSquared(p2 - ci); + float d3 = lengthSquared(p3 - ci); + + uint index; + if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; + else if (d1 < d2 && d1 < d3) index = 1; + else if (d2 < d3) index = 2; + else index = 3; + + indices |= index << (2 * i); + } + + return indices; +#else + uint indices0 = 0; + uint indices1 = 0; + + VVector3 vw = vbroadcast(color_weights); + VVector3 vp0 = vbroadcast(palette[0]) * vw; + VVector3 vp1 = vbroadcast(palette[1]) * vw; + VVector3 vp2 = vbroadcast(palette[2]) * vw; + VVector3 vp3 = vbroadcast(palette[3]) * vw; + + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]) * vw; + + VFloat d0 = vlen2(vc - vp0); + VFloat d1 = vlen2(vc - vp1); + VFloat d2 = vlen2(vc - vp2); + VFloat d3 = vlen2(vc - vp3); + + //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3); + VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3); + VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3); + VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); + //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0)); + + indices0 |= mask(i2 | i3) << i; + indices1 |= mask(i1 | i3) << i; + } + + uint indices = interleave(indices1, indices0); + return indices; +#endif +} + + +static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u > color1.u) { + swap(color0, color1); + } + + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices(input_colors, color_weights, palette); +} + +static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u < color1.u) { + swap(color0, color1); + } + + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices4(input_colors, color_weights, palette); +} + + +static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u < color1.u) { + swap(color0, color1); + } + + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices4(input_colors, palette); +} + +// Least squares fitting of color end points for the given indices. @@ Take weights into account. +static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum = { 0,0,0 }; + Vector3 betax_sum = { 0,0,0 }; + + for (int i = 0; i < count; i++) + { + const uint bits = indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i].xyz; + betax_sum += beta * colors[i].xyz; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return false; + + float factor = 1.0f / denom; + + *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); + *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); + + return true; +} + +// Least squares optimization with custom factors. +// This allows us passing the standard [1, 0, 2/3 1/3] weights by default, but also use alternative mappings when the number of clusters is not 4. +static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum = { 0,0,0 }; + Vector3 betax_sum = { 0,0,0 }; + + for (int i = 0; i < count; i++) + { + const uint idx = (indices >> (2 * i)) & 3; + float alpha = factors[idx]; + float beta = 1 - alpha; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return false; + + float factor = 1.0f / denom; + + *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); + *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); + + return true; +} + +static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b) +{ + float factors[4] = { 1, 0, 2.f / 3, 1.f / 3 }; + return optimize_end_points4(indices, colors, count, factors, a, b); +} + + +// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account. +static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum = { 0,0,0 }; + Vector3 betax_sum = { 0,0,0 }; + + for (int i = 0; i < count; i++) + { + const uint bits = indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return false; + + float factor = 1.0f / denom; + + *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); + *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); + + return true; +} + + +// find minimum and maximum colors based on bounding box in color space +inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) +{ + *c0 = { 0,0,0 }; + *c1 = { 1,1,1 }; + + for (int i = 0; i < count; i++) { + *c0 = max(*c0, colors[i]); + *c1 = min(*c1, colors[i]); + } +} + +inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) +{ + Vector3 center = (*c0 + *c1) * 0.5f; + + /*Vector3 center = colors[0]; + for (int i = 1; i < count; i++) { + center = center * float(i-1) / i + colors[i] / i; + }*/ + /*Vector3 center = colors[0]; + for (int i = 1; i < count; i++) { + center += colors[i]; + } + center /= count;*/ + + float cov_xz = 0.0f; + float cov_yz = 0.0f; + for (int i = 0; i < count; i++) { + Vector3 t = colors[i] - center; + cov_xz += t.x * t.z; + cov_yz += t.y * t.z; + } + + float x0 = c0->x; + float y0 = c0->y; + float x1 = c1->x; + float y1 = c1->y; + + if (cov_xz < 0) { + swap(x0, x1); + } + if (cov_yz < 0) { + swap(y0, y1); + } + + *c0 = { x0, y0, c0->z }; + *c1 = { x1, y1, c1->z }; +} + +inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1) +{ + float bias = (8.0f / 255.0f) / 16.0f; + Vector3 inset = (*c0 - *c1) / 16.0f - scalar_to_vector3(bias); + *c0 = saturate(*c0 - inset); + *c1 = saturate(*c1 + inset); +} + + + +// Single color lookup tables from: +// https://github.com/nothings/stb/blob/master/stb_dxt.h +static uint8 s_match5[256][2]; +static uint8 s_match6[256][2]; + +static inline int Lerp13(int a, int b) +{ + // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. + return (a * 2 + b) / 3; +} + +static void PrepareOptTable(uint8 * table, const uint8 * expand, int size) +{ + for (int i = 0; i < 256; i++) { + int bestErr = 256 * 100; + + for (int min = 0; min < size; min++) { + for (int max = 0; max < size; max++) { + int mine = expand[min]; + int maxe = expand[max]; + + int err = abs(Lerp13(maxe, mine) - i) * 100; + + // DX10 spec says that interpolation must be within 3% of "correct" result, + // add this as error term. (normally we'd expect a random distribution of + // +-1.5% error, but nowhere in the spec does it say that the error has to be + // unbiased - better safe than sorry). + err += abs(max - min) * 3; + + if (err < bestErr) { + bestErr = err; + table[i * 2 + 0] = max; + table[i * 2 + 1] = min; + } + } + } + } +} + +static void init_single_color_tables() +{ + // Prepare single color lookup tables. + uint8 expand5[32]; + uint8 expand6[64]; + for (int i = 0; i < 32; i++) expand5[i] = (i << 3) | (i >> 2); + for (int i = 0; i < 64; i++) expand6[i] = (i << 2) | (i >> 4); + + PrepareOptTable(&s_match5[0][0], expand5, 32); + PrepareOptTable(&s_match6[0][0], expand6, 64); +} + +// Single color compressor, based on: +// https://mollyrocket.com/forums/viewtopic.php?t=392 +static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +{ + output->col0.r = s_match5[c.r][0]; + output->col0.g = s_match6[c.g][0]; + output->col0.b = s_match5[c.b][0]; + output->col1.r = s_match5[c.r][1]; + output->col1.g = s_match6[c.g][1]; + output->col1.b = s_match5[c.b][1]; + output->indices = 0xaaaaaaaa; + + if (output->col0.u < output->col1.u) + { + swap(output->col0.u, output->col1.u); + output->indices ^= 0x55555555; + } +} + + +// Compress block using the average color. +static float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) +{ + // Compute block average. + Vector3 color_sum = { 0,0,0 }; + float weight_sum = 0; + + for (int i = 0; i < count; i++) { + color_sum += colors[i] * weights[i]; + weight_sum += weights[i]; + } + + // Compress optimally. + compress_dxt1_single_color_optimal(vector3_to_color32(color_sum / weight_sum), output); + + // Decompress block color. + Color32 palette[4]; + evaluate_palette(output->col0, output->col1, palette); + + Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]); + + // Evaluate error. + float error = 0; + for (int i = 0; i < count; i++) { + error += weights[i] * evaluate_mse(block_color, colors[i], color_weights); + } + return error; +} + +static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool use_transparent_black, BlockDXT1 * output) +{ + Vector3 metric_sqr = color_weights * color_weights; + + SummedAreaTable sat; + int sat_count = compute_sat(colors, weights, count, &sat); + + Vector3 start, end; +#if ICBC_FAST_CLUSTER_FIT + if (sat_count == 16) fast_cluster_fit_four(sat, metric_sqr, &start, &end); + else cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); +#else + cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); +#endif + + output_block4(input_colors, color_weights, start, end, output); + + float best_error = evaluate_mse(input_colors, input_weights, color_weights, output); + + if (three_color_mode) { + if (use_transparent_black) { + Vector3 tmp_colors[16]; + float tmp_weights[16]; + int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights); + if (!tmp_count) return best_error; + + sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat); + } + + cluster_fit_three(sat, sat_count, metric_sqr, &start, &end); + + BlockDXT1 three_color_block; + output_block3(input_colors, color_weights, start, end, &three_color_block); + + float three_color_error = evaluate_mse(input_colors, input_weights, color_weights, &three_color_block); + + if (three_color_error < best_error) { + best_error = three_color_error; + *output = three_color_block; + } + } + + return best_error; +} + + +static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) { + // TODO: + // - Optimize palette evaluation when updating only one channel. + // - try all diagonals. + + // Things that don't help: + // - Alternate endpoint updates. + // - Randomize order. + // - If one direction does not improve, test opposite direction next. + + static const int8 deltas[16][3] = { + {1,0,0}, + {0,1,0}, + {0,0,1}, + + {-1,0,0}, + {0,-1,0}, + {0,0,-1}, + + {1,1,0}, + {1,0,1}, + {0,1,1}, + + {-1,-1,0}, + {-1,0,-1}, + {0,-1,-1}, + + {-1,1,0}, + //{-1,0,1}, + + {1,-1,0}, + {0,-1,1}, + + //{1,0,-1}, + {0,1,-1}, + }; + + float best_error = input_error; + + int lastImprovement = 0; + for (int i = 0; i < 256; i++) { + BlockDXT1 refined = *output; + int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] }; + + if ((i / 16) & 1) { + refined.col0.r += delta[0]; + refined.col0.g += delta[1]; + refined.col0.b += delta[2]; + } + else { + refined.col1.r += delta[0]; + refined.col1.g += delta[1]; + refined.col1.b += delta[2]; + } + + if (!three_color_mode) { + if (refined.col0.u == refined.col1.u) refined.col1.g += 1; + if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u); + } + + Vector3 palette[4]; + evaluate_palette(output->col0, output->col1, palette); + + refined.indices = compute_indices(input_colors, color_weights, palette); + + float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined); + if (refined_error < best_error) { + best_error = refined_error; + *output = refined; + lastImprovement = i; + } + + // Early out if the last 32 steps didn't improve error. + if (i - lastImprovement > 32) break; + } + + return best_error; +} + +struct Options { + float threshold = 0.0f; + bool box_fit = false; + bool least_squares_fit = false; + bool cluster_fit = false; + bool cluster_fit_3 = false; + bool cluster_fit_3_black_only = false; + bool endpoint_refinement = false; +}; + +static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) { + Options opt; + + switch (level) { + case Quality_Level1: // Box fit + least squares fit. + opt.box_fit = true; + opt.least_squares_fit = true; + opt.threshold = 1.0f / 256; + break; + + case Quality_Level2: // Cluster fit 4, threshold = 24. + opt.box_fit = true; + opt.least_squares_fit = true; + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 24; + break; + + case Quality_Level3: // Cluster fit 4, threshold = 32. + opt.box_fit = true; + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 32; + break; + + case Quality_Level4: // Cluster fit 3+4, threshold = 48. + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 48; + break; + + case Quality_Level5: // Cluster fit 3+4, threshold = 64. + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 64; + break; + + case Quality_Level6: // Cluster fit 3+4, threshold = 96. + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 96; + break; + + case Quality_Level7: // Cluster fit 3+4, threshold = 128. + opt.cluster_fit = true; + opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; + opt.threshold = 1.0f / 128; + break; + + case Quality_Level8: // Cluster fit 3+4, threshold = 256. + opt.cluster_fit = true; + opt.cluster_fit_3 = enable_three_color_mode; + opt.threshold = 1.0f / 256; + break; + + case Quality_Level9: // Cluster fit 3+4, threshold = 256 + Refinement. + opt.cluster_fit = true; + opt.cluster_fit_3 = enable_three_color_mode; + opt.threshold = 1.0f / 256; + opt.endpoint_refinement = true; + break; + } + + return opt; +} + + +static float compress_dxt1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockDXT1 * output) +{ + Options opt = setup_options(level, three_color_mode, three_color_black); + + Vector3 colors[16]; + float weights[16]; + bool any_black = false; + int count; + if (opt.cluster_fit) { + count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black); + } + else { + for (int i = 0; i < 16; i++) { + colors[i] = input_colors[i].xyz; + } + count = 16; + } + + if (count == 0) { + // Output trivial block. + output->col0.u = 0; + output->col1.u = 0; + output->indices = 0; + return 0; + } + + // Cluster fit cannot handle single color blocks, so encode them optimally. + if (count == 1) { + compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); + return evaluate_mse(input_colors, input_weights, color_weights, output); + } + + float error = FLT_MAX; + + // Quick end point selection. + if (opt.box_fit) { + Vector3 c0, c1; + fit_colors_bbox(colors, count, &c0, &c1); + inset_bbox(&c0, &c1); + select_diagonal(colors, count, &c0, &c1); + output_block4(input_colors, color_weights, c0, c1, output); + + error = evaluate_mse(input_colors, input_weights, color_weights, output); + + // Refine color for the selected indices. + if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { + BlockDXT1 optimized_block; + output_block4(input_colors, color_weights, c0, c1, &optimized_block); + + float optimized_error = evaluate_mse(input_colors, input_weights, color_weights, &optimized_block); + if (optimized_error < error) { + error = optimized_error; + *output = optimized_block; + } + } + } + + if (opt.cluster_fit) { + // @@ Use current endpoints as input for initial PCA approximation? + + bool use_three_color_black = any_black && three_color_black; + bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only); + + // Try cluster fit. + BlockDXT1 cluster_fit_output; + float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, &cluster_fit_output); + if (cluster_fit_error < error) { + *output = cluster_fit_output; + error = cluster_fit_error; + } + } + + if (opt.endpoint_refinement) { + error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output); + } + + return error; +} + + +static float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output) +{ + Vector3 colors[16]; + for (int i = 0; i < 16; i++) { + colors[i] = input_colors[i].xyz; + } + int count = 16; + + /*float error = FLT_MAX; + error = compress_dxt1_single_color(colors, input_weights, count, color_weights, output); + + if (error == 0.0f || count == 1) { + // Early out. + return error; + }*/ + + // Quick end point selection. + Vector3 c0, c1; + fit_colors_bbox(colors, count, &c0, &c1); + if (c0 == c1) { + compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); + return evaluate_mse(input_colors, input_weights, color_weights, output); + } + inset_bbox(&c0, &c1); + select_diagonal(colors, count, &c0, &c1); + output_block4(input_colors, color_weights, c0, c1, output); + + // Refine color for the selected indices. + if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { + output_block4(input_colors, color_weights, c0, c1, output); + } + + return evaluate_mse(input_colors, input_weights, color_weights, output); +} + + +static void compress_dxt1_fast(const uint8 input_colors[16*4], BlockDXT1 * output) { + + Vector3 vec_colors[16]; + for (int i = 0; i < 16; i++) { + vec_colors[i] = { input_colors[4 * i + 0] / 255.0f, input_colors[4 * i + 1] / 255.0f, input_colors[4 * i + 2] / 255.0f }; + } + + // Quick end point selection. + Vector3 c0, c1; + //fit_colors_bbox(colors, count, &c0, &c1); + //select_diagonal(colors, count, &c0, &c1); + fit_colors_bbox(vec_colors, 16, &c0, &c1); + if (c0 == c1) { + compress_dxt1_single_color_optimal(vector3_to_color32(c0), output); + return; + } + inset_bbox(&c0, &c1); + select_diagonal(vec_colors, 16, &c0, &c1); + output_block4(vec_colors, c0, c1, output); + + // Refine color for the selected indices. + if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) { + output_block4(vec_colors, c0, c1, output); + } +} + +// Public API + +void init_dxt1() { + init_single_color_tables(); + init_cluster_tables(); +} + +float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool three_color_mode, bool three_color_black, void * output) { + return compress_dxt1(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, three_color_black, (BlockDXT1*)output); +} + +float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], void * output) { + return compress_dxt1_fast((Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, (BlockDXT1*)output); +} + +void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output) { + compress_dxt1_fast(input_colors, (BlockDXT1*)output); +} + +void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) { + decode_dxt1((const BlockDXT1 *)block, rgba_block, decoder); +} + +float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) { + return evaluate_dxt1_error(rgba_block, (const BlockDXT1 *)dxt_block, decoder); +} + +} // icbc + +// // Do not polute preprocessor definitions. +// #undef ICBC_DECODER +// #undef ICBC_SIMD +// #undef ICBC_ASSERT + +// #undef ICBC_FLOAT +// #undef ICBC_SSE2 +// #undef ICBC_SSE41 +// #undef ICBC_AVX1 +// #undef ICBC_AVX2 +// #undef ICBC_AVX512 +// #undef ICBC_NEON +// #undef ICBC_VMX + +// #undef ICBC_USE_FMA +// #undef ICBC_USE_AVX2_PERMUTE2 +// #undef ICBC_USE_AVX512_PERMUTE +// #undef ICBC_USE_NEON_VTL + +// #undef ICBC_FAST_CLUSTER_FIT +// #undef ICBC_PERFECT_ROUND +// #undef ICBC_USE_SAT + +#endif // ICBC_IMPLEMENTATION + +// Version History: +// v1.00 - Initial release. +// v1.01 - Added SPMD code path with AVX support. +// v1.02 - Removed SIMD code path. +// v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection. + +// Copyright (c) 2020 Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/src/nvtt/tests/CMakeLists.txt b/src/nvtt/tests/CMakeLists.txt index 872fd8f..8b56d23 100644 --- a/src/nvtt/tests/CMakeLists.txt +++ b/src/nvtt/tests/CMakeLists.txt @@ -28,8 +28,8 @@ TARGET_LINK_LIBRARIES(cubemaptest nvcore nvmath nvimage nvtt) ADD_EXECUTABLE(nvhdrtest hdrtest.cpp) TARGET_LINK_LIBRARIES(nvhdrtest nvcore nvimage nvtt bc6h nvmath) -ADD_EXECUTABLE(bc1enc bc1enc.cpp) -TARGET_LINK_LIBRARIES(bc1enc nvcore nvimage nvmath squish CMP_Core) +#ADD_EXECUTABLE(bc1enc bc1enc.cpp) +#TARGET_LINK_LIBRARIES(bc1enc nvcore nvimage nvmath squish CMP_Core) INSTALL(TARGETS nvtestsuite nvhdrtest DESTINATION bin)