diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e4bab9..3cd0dac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,16 +13,2 @@ set (CMAKE_CXX_STANDARD 11)
 
-IF(WIN32)
-	# gnuwin32 paths:
-	SET(GNUWIN32_PATH "${NV_SOURCE_DIR}/extern/gnuwin32")
-	SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "${GNUWIN32_PATH}/include")
-	SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "${GNUWIN32_PATH}/lib")
-	
-	# Set GLUT path:
-	SET(GLUT_ROOT_DIR "${NV_SOURCE_DIR}/extern/glut")
-
-	# Set FreeImage path:
-	SET(FREEIMAGE_ROOT_DIR "${NV_SOURCE_DIR}/extern/FreeImage")
-	
-ENDIF(WIN32)
-
 INCLUDE(${NV_CMAKE_DIR}/OptimalOptions.cmake)
@@ -36,11 +22,5 @@ IF(CMAKE_BUILD_TYPE MATCHES "debug")
 ENDIF()
 
 
-IF(NVTT_SHARED)
-	SET(NVCORE_SHARED TRUE)
-	SET(NVMATH_SHARED TRUE)
-	SET(NVIMAGE_SHARED TRUE)
-ENDIF(NVTT_SHARED)
-
 ADD_SUBDIRECTORY(extern)
 
diff --git a/src/nvcore/nvcore.h b/src/nvcore/nvcore.h
index 2fb68cc..5657a31 100644
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@@ -1,4 +1,4 @@
-// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+// This code is in the public domain -- Ignacio Castano <castano@gmail.com>
 
 #pragma once
 #ifndef NV_CORE_H
diff --git a/src/nvmath/SimdVector.h b/src/nvmath/SimdVector.h
index e20b7a9..3f36713 100644
--- a/src/nvmath/SimdVector.h
+++ b/src/nvmath/SimdVector.h
@@ -1,4 +1,4 @@
-// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+// This code is in the public domain -- Ignacio Castano <castano@gmail.com>
 #pragma once
 
 #include "nvmath.h"
diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp
index adf97fe..fded226 100644
--- a/src/nvtt/BlockCompressor.cpp
+++ b/src/nvtt/BlockCompressor.cpp
@@ -206,15 +206,16 @@ void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d,
 
 
 // BC1
-#include "CompressorDXT1.h"
+#include "icbc.h"
 
 void FastCompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
-    compress_dxt1_fast(colors, weights, compressionOptions.colorWeight.xyz(), (BlockDXT1 *)output);
+    icbc::compress_dxt1_fast((float*)colors, weights, compressionOptions.colorWeight.component, output);
 }
 void CompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
-    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, false, (BlockDXT1 *)output);
+    bool hq = compressionOptions.quality > Quality_Normal;
+    icbc::compress_dxt1((float*)colors, weights, compressionOptions.colorWeight.component, /*three_color_mode*/true, hq, output);
 }
 
 
diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt
index 0e697e6..501ce4a 100644
--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@@ -1,63 +1,63 @@
-PROJECT(nvtt)
-
-ADD_SUBDIRECTORY(squish)
-
-SET(NVTT_SRCS
-    nvtt.h nvtt.cpp
-    nvtt_wrapper.h nvtt_wrapper.cpp
-    ClusterFit.h ClusterFit.cpp
-    Compressor.h
-    BlockCompressor.h BlockCompressor.cpp
-    CompressorDX9.h CompressorDX9.cpp
-    CompressorDX10.h CompressorDX10.cpp
-    CompressorDX11.h CompressorDX11.cpp
-    CompressorDXT1.h CompressorDXT1.cpp
-    CompressorDXT5_RGBM.h CompressorDXT5_RGBM.cpp
-    CompressorETC.h CompressorETC.cpp
-    CompressorRGB.h CompressorRGB.cpp
-    Context.h Context.cpp
-    QuickCompressDXT.h QuickCompressDXT.cpp
-    OptimalCompressDXT.h OptimalCompressDXT.cpp
-    SingleColorLookup.h SingleColorLookup.cpp
-    CompressionOptions.h CompressionOptions.cpp
-    InputOptions.h InputOptions.cpp
-    OutputOptions.h OutputOptions.cpp
-    TaskDispatcher.h #TaskDispatcher.cpp
-    Surface.h Surface.cpp
-    CubeSurface.h CubeSurface.cpp
-    cuda/CudaUtils.h cuda/CudaUtils.cpp
-    cuda/CudaMath.h
-    cuda/BitmapTable.h
-    cuda/CudaCompressorDXT.h cuda/CudaCompressorDXT.cpp)
-
-IF (CUDA_FOUND)
-    ADD_DEFINITIONS(-DHAVE_CUDA)
-    CUDA_COMPILE(CUDA_SRCS cuda/CompressKernel.cu)
-    SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS})
-    SET(LIBS ${LIBS} ${CUDA_LIBRARIES})
-    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
-ENDIF (CUDA_FOUND)
-
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/rg_etc1_v104)
-
-ADD_DEFINITIONS(-DNVTT_EXPORTS)
-
-IF(NVTT_SHARED)	
-    ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
-ELSE(NVTT_SHARED)
-    ADD_LIBRARY(nvtt ${NVTT_SRCS})
-ENDIF(NVTT_SHARED)
-
-TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread nvsquish bc6h bc7 nvmath rg_etc1)
-
-INSTALL(TARGETS nvtt 
-    RUNTIME DESTINATION bin
-    LIBRARY DESTINATION lib
-    ARCHIVE DESTINATION lib/static)
-
-INSTALL(FILES nvtt.h nvtt_wrapper.h DESTINATION include/nvtt)
-
-
-ADD_SUBDIRECTORY(tools)
-ADD_SUBDIRECTORY(tests)
+PROJECT(nvtt)
+
+ADD_SUBDIRECTORY(squish)
+
+SET(NVTT_SRCS
+    nvtt.h nvtt.cpp
+    nvtt_wrapper.h nvtt_wrapper.cpp
+    ClusterFit.h ClusterFit.cpp
+    Compressor.h
+    BlockCompressor.h BlockCompressor.cpp
+    CompressorDX9.h CompressorDX9.cpp
+    CompressorDX10.h CompressorDX10.cpp
+    CompressorDX11.h CompressorDX11.cpp
+    icbc.h icbc.cpp
+    CompressorDXT5_RGBM.h CompressorDXT5_RGBM.cpp
+    CompressorETC.h CompressorETC.cpp
+    CompressorRGB.h CompressorRGB.cpp
+    Context.h Context.cpp
+    QuickCompressDXT.h QuickCompressDXT.cpp
+    OptimalCompressDXT.h OptimalCompressDXT.cpp
+    SingleColorLookup.h SingleColorLookup.cpp
+    CompressionOptions.h CompressionOptions.cpp
+    InputOptions.h InputOptions.cpp
+    OutputOptions.h OutputOptions.cpp
+    TaskDispatcher.h #TaskDispatcher.cpp
+    Surface.h Surface.cpp
+    CubeSurface.h CubeSurface.cpp
+    cuda/CudaUtils.h cuda/CudaUtils.cpp
+    cuda/CudaMath.h
+    cuda/BitmapTable.h
+    cuda/CudaCompressorDXT.h cuda/CudaCompressorDXT.cpp)
+
+IF (CUDA_FOUND)
+    ADD_DEFINITIONS(-DHAVE_CUDA)
+    CUDA_COMPILE(CUDA_SRCS cuda/CompressKernel.cu)
+    SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS})
+    SET(LIBS ${LIBS} ${CUDA_LIBRARIES})
+    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+ENDIF (CUDA_FOUND)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/rg_etc1_v104)
+
+ADD_DEFINITIONS(-DNVTT_EXPORTS)
+
+IF(NVTT_SHARED)	
+    ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
+ELSE(NVTT_SHARED)
+    ADD_LIBRARY(nvtt ${NVTT_SRCS})
+ENDIF(NVTT_SHARED)
+
+TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread nvsquish bc6h bc7 nvmath rg_etc1)
+
+INSTALL(TARGETS nvtt 
+    RUNTIME DESTINATION bin
+    LIBRARY DESTINATION lib
+    ARCHIVE DESTINATION lib/static)
+
+INSTALL(FILES nvtt.h nvtt_wrapper.h DESTINATION include/nvtt)
+
+
+ADD_SUBDIRECTORY(tools)
+ADD_SUBDIRECTORY(tests)
diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
deleted file mode 100644
index 1faad8c..0000000
--- a/src/nvtt/CompressorDXT1.cpp
+++ /dev/null
@@ -1,1698 +0,0 @@
-
-#include "CompressorDXT1.h"
-#include "ClusterFit.h"
-
-#include "nvmath/nvmath.h"
-
-#include <string.h> // memset
-#include <float.h> // FLT_MAX
-
-
-using namespace nv;
-
-/// Swap two values.
-/*template <typename T>
-inline void swap(T & a, T & b)
-{
-    T temp(a);
-    a = b;
-    b = temp;
-}*/
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Basic Types
-
-struct Color16 {
-    union {
-        struct {
-            uint16 b : 5;
-            uint16 g : 6;
-            uint16 r : 5;
-        };
-        uint16 u;
-    };
-};
-
-struct Color32 {
-    union {
-        struct {
-            uint8 b, g, r, a;
-        };
-        uint32 u;
-    };
-};
-
-namespace nv {
-    struct BlockDXT1 {
-        Color16 col0;
-        Color16 col1;
-        uint32 indices;
-    };
-
-
-    /*struct Vector3 {
-        float x, y, z;
-    };*/
-
-    inline Vector3 operator*(Vector3 v, float s) {
-        return { v.x * s, v.y * s, v.z * s };
-    }
-
-    inline Vector3 operator*(float s, Vector3 v) {
-        return { v.x * s, v.y * s, v.z * s };
-    }
-
-    inline Vector3 operator*(Vector3 a, Vector3 b) {
-        return { a.x * b.x, a.y * b.y, a.z * b.z };
-    }
-
-    inline float dot(Vector3 a, Vector3 b) {
-        return a.x * b.x + a.y * b.y + a.z * b.z;
-    }
-
-    inline Vector3 operator+(Vector3 a, Vector3 b) {
-        return { a.x + b.x, a.y + b.y, a.z + b.z };
-    }
-
-    inline Vector3 operator-(Vector3 a, Vector3 b) {
-        return { a.x - b.x, a.y - b.y, a.z - b.z };
-    }
-
-    inline Vector3 operator/(Vector3 v, float s) {
-        return { v.x / s, v.y / s, v.z / s };
-    }
-
-    /*inline float saturate(float x) {
-        return x < 0 ? 0 : (x > 1 ? 1 : x);
-    }*/
-
-    inline Vector3 saturate(Vector3 v) {
-        return { saturate(v.x), saturate(v.y), saturate(v.z) };
-    }
-
-    inline Vector3 min(Vector3 a, Vector3 b) {
-        return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
-    }
-
-    inline Vector3 max(Vector3 a, Vector3 b) {
-        return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
-    }
-
-    inline bool operator==(const Vector3 & a, const Vector3 & b) {
-        return memcmp(&a, &b, sizeof(Vector3));
-    }
-
-    inline void Vector3::set(float x, float y, float z) {
-        this->x = x; this->y = y; this->z = z;
-    }
-
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Color conversion functions.
-
-static const float midpoints5[32] = {
-    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
-    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
-};
-
-static const float midpoints6[64] = {
-    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 
-    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 
-    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 
-    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
-};
-
-/*void init_tables() {
-    for (int i = 0; i < 31; i++) {
-        float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
-        float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
-        midpoints5[i] = (f0 + f1) * 0.5;
-    }
-    midpoints5[31] = 1.0f;
-
-    for (int i = 0; i < 63; i++) {
-        float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
-        float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
-        midpoints6[i] = (f0 + f1) * 0.5;
-    }
-    midpoints6[63] = 1.0f;
-}*/
-
-static Color16 vector3_to_color16(const Vector3 & v) {
-
-    // Truncate.
-    uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f));
-	uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f));
-	uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f));
-
-    // Round exactly according to 565 bit-expansion.
-    r += (v.x > midpoints5[r]);
-    g += (v.y > midpoints6[g]);
-    b += (v.z > midpoints5[b]);
-
-    Color16 c;
-    c.u = (r << 11) | (g << 5) | b;
-    return c;
-}
-
-
-
-static Color32 bitexpand_color16_to_color32(Color16 c16) {
-    Color32 c32;
-    //c32.b = (c16.b << 3) | (c16.b >> 2);
-    //c32.g = (c16.g << 2) | (c16.g >> 4);
-    //c32.r = (c16.r << 3) | (c16.r >> 2);
-    //c32.a = 0xFF;
-
-    c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
-    c32.u |= (c32.u >> 5) & 0x070007;
-    c32.u |= (c32.u >> 6) & 0x000300;
-
-    return c32;
-}
-
-inline Vector3 color_to_vector3(Color32 c)
-{
-    return Vector3(c.r / 255.0f, c.g / 255.0f, c.b / 255.0f);
-}
-
-inline Color32 vector3_to_color32(Vector3 v)
-{
-    Color32 color;
-    color.r = uint8(saturate(v.x) * 255 + 0.5f);
-    color.g = uint8(saturate(v.y) * 255 + 0.5f);
-    color.b = uint8(saturate(v.z) * 255 + 0.5f);
-    color.a = 255;
-    return color;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Input block processing.
-
-// Find first valid color.
-/*static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color)
-{
-    for (int i = 0; i < count; i++) {
-        if (weights[i] > 0.0f) {
-            *valid_color = colors[i];
-            return true;
-        }
-    }
-
-    // No valid colors.
-    return false;
-}*/
-
-/*static bool is_single_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 color)
-{
-    for (int i = 0; i < count; i++) {
-        if (weights[i] > 0.0f) {
-            if (colors[i] != color) return false;
-        }
-    }
-
-    return true;
-}*/
-
-// Find similar colors and combine them together.
-static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights)
-{
-    int n = 0;
-    for (int i = 0; i < 16; i++)
-    {
-        Vector3 ci = input_colors[i].xyz();
-        float wi = input_weights[i];
-
-        if (wi > 0) {
-            // Find matching color.
-            int j;
-            for (j = 0; j < n; j++) {
-                if (equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z)) {
-                    weights[j] += wi;
-                    break;
-                }
-            }
-
-            // No match found. Add new color.
-            if (j == n) {
-                colors[n] = ci;
-                weights[n] = wi;
-                n++;
-            }
-        }
-    }
-
-    nvDebugCheck(n <= 16);
-
-    return n;
-}
-
-static int reduce_colors(const uint8 * input_colors, Vector3 * colors, float * weights)
-{
-    int n = 0;
-    for (int i = 0; i < 16; i++)
-    {
-        Vector3 ci;
-        ci.x = float(input_colors[4 * i + 0]);
-        ci.y = float(input_colors[4 * i + 1]);
-        ci.z = float(input_colors[4 * i + 2]);
-
-        // Find matching color.
-        int j;
-        for (j = 0; j < n; j++) {
-            if (equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z)) {
-                weights[j] += 1.0f;
-                break;
-            }
-        }
-
-        // No match found. Add new color.
-        if (j == n) {
-            colors[n] = ci;
-            weights[n] = 1.0f;
-            n++;
-        }
-    }
-
-    nvDebugCheck(n <= 16);
-
-    return n;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Palette evaluation.
-
-#define DECODER 0
-
-inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4], bool d3d9_bias) {
-#if DECODER == 0 || DECODER == 1
-    palette[2].r = (2 * palette[0].r + palette[1].r + d3d9_bias) / 3;
-    palette[2].g = (2 * palette[0].g + palette[1].g + d3d9_bias) / 3;
-    palette[2].b = (2 * palette[0].b + palette[1].b + d3d9_bias) / 3;
-    palette[3].r = (2 * palette[1].r + palette[0].r + d3d9_bias) / 3;
-    palette[3].g = (2 * palette[1].g + palette[0].g + d3d9_bias) / 3;
-    palette[3].b = (2 * palette[1].b + palette[0].b + d3d9_bias) / 3;
-#else
-    int dg = palette[1].g - palette[0].g;
-    palette[2].r = ((2 * c0.r + c1.r) * 22) / 8;
-    palette[2].g = (256 * palette[0].g + dg * 80 + dg / 4 + 128) / 256;
-    palette[2].b = ((2 * c0.b + c1.b) * 22) / 8;
-    palette[3].r = ((2 * c1.r + c0.r) * 22) / 8;
-    palette[3].g = (256 * palette[1].g - dg * 80 - dg / 4 + 128) / 256;
-    palette[3].b = ((2 * c1.b + c0.b) * 22) / 8;
-#endif
-}
-
-inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) {
-#if DECODER == 0 || DECODER == 1
-    palette[2].r = (palette[0].r + palette[1].r) / 2;
-    palette[2].g = (palette[0].g + palette[1].g) / 2;
-    palette[2].b = (palette[0].b + palette[1].b) / 2;
-#else
-    int dg = palette[1].g - palette[0].g;
-    palette[2].r = ((c0.r + c1.r) * 33) / 8;
-    palette[2].g = (256 * palette[0].g + dg * 128 + dg / 4 + 128) / 256;
-    palette[2].b = ((c0.b + c1.b) * 33) / 8;
-#endif
-    palette[3].r = 0;
-    palette[3].g = 0;
-    palette[3].b = 0;
-}
-
-static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], bool d3d9_bias) {
-    palette[0] = bitexpand_color16_to_color32(c0);
-    palette[1] = bitexpand_color16_to_color32(c1);
-    if (c0.u > c1.u) {
-        evaluate_palette4(c0, c1, palette, d3d9_bias);
-    }
-    else {
-        evaluate_palette3(c0, c1, palette);
-    }
-}
-
-static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
-    palette[0].r = (3 * c0.r * 22) / 8;
-    palette[0].g = (c0.g << 2) | (c0.g >> 4);
-    palette[0].b = (3 * c0.b * 22) / 8;
-    palette[1].a = 255;
-    palette[1].r = (3 * c1.r * 22) / 8;
-    palette[1].g = (c1.g << 2) | (c1.g >> 4);
-    palette[1].b = (3 * c1.b * 22) / 8;
-    palette[1].a = 255;
-
-    int gdiff = palette[1].g - palette[0].g;
-    if (c0.u > c1.u) {
-        palette[2].r = ((2 * c0.r + c1.r) * 22) / 8;
-        palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256;
-        palette[2].b = ((2 * c0.b + c1.b) * 22) / 8;
-        palette[2].a = 0xFF;
-
-        palette[3].r = ((2 * c1.r + c0.r) * 22) / 8;
-        palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256;
-        palette[3].b = ((2 * c1.b + c0.b) * 22) / 8;
-        palette[3].a = 0xFF;
-    }
-    else {
-        palette[2].r = ((c0.r + c1.r) * 33) / 8;
-        palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256;
-        palette[2].b = ((c0.b + c1.b) * 33) / 8;
-        palette[2].a = 0xFF;
-        palette[3].u = 0;
-    }
-}
-
-static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) {
-#if DECODER == 0
-    evaluate_palette(c0, c1, palette, false);
-#elif DECODER == 1
-    evaluate_palette(c0, c1, palette, true);
-#elif DECODER == 2
-    evaluate_palette_nv(c0, c1, palette);
-#endif
-}
-
-static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
-    Color32 palette32[4];
-    evaluate_palette(c0, c1, palette32);
-
-    for (int i = 0; i < 4; i++) {
-        palette[i] = color_to_vector3(palette32[i]);
-    }
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Error evaluation.
-
-// Different ways of estimating the error.
-
-static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
-    Vector3 d = (p * 255 - c * 255) * w;
-    return dot(d, d);
-}
-
-static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) {
-    Vector3 d = (Vector3(p.r, p.g, p.b) - c * 255) * w;
-    return dot(d, d);
-}
-
-
-/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
-    return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z);
-}*/
-
-static int evaluate_mse(const Color32 & p, const Color32 & c) {
-    return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
-}
-
-/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
-    float e0 = evaluate_mse(palette[0], c, w);
-    float e1 = evaluate_mse(palette[1], c, w);
-    float e2 = evaluate_mse(palette[2], c, w);
-    float e3 = evaluate_mse(palette[3], c, w);
-    return min(min(e0, e1), min(e2, e3));
-}*/
-
-static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
-    int e0 = evaluate_mse(palette[0], c);
-    int e1 = evaluate_mse(palette[1], c);
-    int e2 = evaluate_mse(palette[2], c);
-    int e3 = evaluate_mse(palette[3], c);
-    return min(min(e0, e1), min(e2, e3));
-}
-
-// Returns MSE error in [0-255] range.
-static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
-    Color32 palette[4];
-    evaluate_palette(output->col0, output->col1, palette);
-    //evaluate_palette_nv(output->col0, output->col1, palette);
-
-    return evaluate_mse(palette[index], color);
-}
-
-// Returns weighted MSE error in [0-255] range.
-static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
-    
-    float total = 0.0f;
-    for (int i = 0; i < count; i++) {
-        total += weights[i] * evaluate_mse(palette, colors[i]);
-    }
-
-    return total;
-}
-
-static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) {
-
-    float total = 0.0f;
-    for (int i = 0; i < count; i++) {
-        total += evaluate_mse(palette, colors[i]);
-    }
-
-    return total;
-}
-
-#if 0
-static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
-    Color32 palette[4];
-    output->evaluatePalette(palette, /*d3d9=*/false);
-
-    // convert palette to float.
-    Vector3 vector_palette[4];
-    for (int i = 0; i < 4; i++) {
-        vector_palette[i] = color_to_vector3(palette[i]);
-    }
-
-    // evaluate error for each index.
-    float error = 0.0f;
-    for (int i = 0; i < 16; i++) {
-        int index = (output->indices >> (2*i)) & 3; // @@ Is this the right order?
-        error += evaluate_mse(vector_palette[index], colors[i]);
-    }
-
-    return error;
-}
-#endif
-
-static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
-    Color32 palette[4];
-    evaluate_palette(output->col0, output->col1, palette);
-    //evaluate_palette_nv5x(output->col0, output->col1, palette);
-
-    // convert palette to float.
-    /*Vector3 vector_palette[4];
-    for (int i = 0; i < 4; i++) {
-        vector_palette[i] = color_to_vector3(palette[i]);
-    }*/
-
-    // evaluate error for each index.
-    float error = 0.0f;
-    for (int i = 0; i < 16; i++) {
-        int index = (output->indices >> (2 * i)) & 3;
-        error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz(), color_weights);
-    }
-    return error;
-}
-
-float nv::evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, int decoder) {
-    Color32 palette[4];
-    if (decoder == 2) {
-        evaluate_palette_nv(block->col0, block->col1, palette);
-
-    }
-    else {
-        evaluate_palette(block->col0, block->col1, palette, /*d3d9=*/decoder);
-    }
-
-    // evaluate error for each index.
-    float error = 0.0f;
-    for (int i = 0; i < 16; i++) {
-        int index = (block->indices >> (2 * i)) & 3;
-        Color32 c;
-        c.r = rgba_block[4 * i + 0];
-        c.g = rgba_block[4 * i + 1];
-        c.b = rgba_block[4 * i + 2];
-        c.a = 255;
-        error += evaluate_mse(palette[index], c);
-    }
-    return error;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-// Index selection
-
-static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
-    
-    uint indices = 0;
-    for (int i = 0; i < 16; i++) {
-        float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
-        float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
-        float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
-        float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
-
-        uint b0 = d0 > d3;
-        uint b1 = d1 > d2;
-        uint b2 = d0 > d2;
-        uint b3 = d1 > d3;
-        uint b4 = d2 > d3;
-
-        uint x0 = b1 & b2;
-        uint x1 = b0 & b3;
-        uint x2 = b0 & b4;
-
-        indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
-    }
-
-    return indices;
-}
-
-
-static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) {
-
-    uint indices = 0;
-    for (int i = 0; i < 16; i++) {
-        float d0 = evaluate_mse(palette[0], input_colors[i], Vector3(1));
-        float d1 = evaluate_mse(palette[1], input_colors[i], Vector3(1));
-        float d2 = evaluate_mse(palette[2], input_colors[i], Vector3(1));
-        float d3 = evaluate_mse(palette[3], input_colors[i], Vector3(1));
-
-        uint b0 = d0 > d3;
-        uint b1 = d1 > d2;
-        uint b2 = d0 > d2;
-        uint b3 = d1 > d3;
-        uint b4 = d2 > d3;
-
-        uint x0 = b1 & b2;
-        uint x1 = b0 & b3;
-        uint x2 = b0 & b4;
-
-        indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
-    }
-
-    return indices;
-}
-
-
-static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
-    
-    uint indices = 0;
-    for (int i = 0; i < 16; i++) {
-        float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights);
-        float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights);
-        float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights);
-        float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights);
-
-        uint index;
-        if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
-        else if (d1 < d2 && d1 < d3) index = 1;
-        else if (d2 < d3) index = 2;
-        else index = 3;
-
-		indices |= index << (2 * i);
-	}
-
-	return indices;
-}
-
-
-static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
-{
-    Color16 color0 = vector3_to_color16(v0);
-    Color16 color1 = vector3_to_color16(v1);
-
-    if (color0.u > color1.u) {
-        swap(color0, color1);
-    }
-
-    Vector3 palette[4];
-    evaluate_palette(color0, color1, palette);
-
-    block->col0 = color0;
-    block->col1 = color1;
-    block->indices = compute_indices(input_colors, color_weights, palette);
-}
-
-static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
-{
-    Color16 color0 = vector3_to_color16(v0);
-    Color16 color1 = vector3_to_color16(v1);
-
-    if (color0.u < color1.u) {
-        swap(color0, color1);
-    }
-
-    Vector3 palette[4];
-    evaluate_palette(color0, color1, palette);
-
-    block->col0 = color0;
-    block->col1 = color1;
-    block->indices = compute_indices4(input_colors, color_weights, palette);
-}
-
-
-static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
-{
-    Color16 color0 = vector3_to_color16(v0);
-    Color16 color1 = vector3_to_color16(v1);
-
-    if (color0.u < color1.u) {
-        swap(color0, color1);
-    }
-
-    Vector3 palette[4];
-    evaluate_palette(color0, color1, palette);
-
-    block->col0 = color0;
-    block->col1 = color1;
-    block->indices = compute_indices4(input_colors, palette);
-}
-
-// Least squares fitting of color end points for the given indices. @@ Take weights into account.
-static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
-{
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    Vector3 alphax_sum(0.0f);
-    Vector3 betax_sum(0.0f);
-
-    for (int i = 0; i < count; i++)
-    {
-        const uint bits = indices >> (2 * i);
-
-        float beta = float(bits & 1);
-        if (bits & 2) beta = (1 + beta) / 3.0f;
-        float alpha = 1.0f - beta;
-
-        alpha2_sum += alpha * alpha;
-        beta2_sum += beta * beta;
-        alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i].xyz();
-        betax_sum += beta * colors[i].xyz();
-    }
-
-    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
-    if (equal(denom, 0.0f)) return false;
-
-    float factor = 1.0f / denom;
-
-    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
-    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
-
-    return true;
-}
-
-static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b)
-{
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    Vector3 alphax_sum(0.0f);
-    Vector3 betax_sum(0.0f);
-
-    for (int i = 0; i < count; i++)
-    {
-        const uint bits = indices >> (2 * i);
-
-        float beta = float(bits & 1);
-        if (bits & 2) beta = (1 + beta) / 3.0f;
-        float alpha = 1.0f - beta;
-
-        alpha2_sum += alpha * alpha;
-        beta2_sum += beta * beta;
-        alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i];
-        betax_sum += beta * colors[i];
-    }
-
-    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
-    if (equal(denom, 0.0f)) return false;
-
-    float factor = 1.0f / denom;
-
-    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
-    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
-
-    return true;
-}
-
-
-// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account.
-static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
-{
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    Vector3 alphax_sum(0.0f);
-    Vector3 betax_sum(0.0f);
-
-    for (int i = 0; i < count; i++)
-    {
-        const uint bits = indices >> (2 * i);
-
-        float beta = float(bits & 1);
-        if (bits & 2) beta = 0.5f;
-        float alpha = 1.0f - beta;
-
-        alpha2_sum += alpha * alpha;
-        beta2_sum += beta * beta;
-        alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i];
-        betax_sum += beta * colors[i];
-    }
-
-    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
-    if (equal(denom, 0.0f)) return false;
-
-    float factor = 1.0f / denom;
-
-    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
-    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
-
-    return true;
-}
-
-// @@ After optimization we need to round end points. Round in all possible directions, and pick best.
-
-
-
-// find minimum and maximum colors based on bounding box in color space
-inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
-{
-    *c0 = Vector3(0);
-    *c1 = Vector3(1);
-
-    for (int i = 0; i < count; i++) {
-        *c0 = max(*c0, colors[i]);
-        *c1 = min(*c1, colors[i]);
-    }
-}
-
-inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
-{
-    Vector3 center = (*c0 + *c1) * 0.5f;
-
-    /*Vector3 center = colors[0];
-    for (int i = 1; i < count; i++) {
-        center = center * float(i-1) / i + colors[i] / i;
-    }*/
-    /*Vector3 center = colors[0];
-    for (int i = 1; i < count; i++) {
-        center += colors[i];
-    }
-    center /= count;*/
-
-    float cov_xz = 0.0f;
-    float cov_yz = 0.0f;
-    for (int i = 0; i < count; i++) {
-        Vector3 t = colors[i] - center;
-        cov_xz += t.x * t.z;
-        cov_yz += t.y * t.z;
-    }
-
-    float x0 = c0->x;
-    float y0 = c0->y;
-    float x1 = c1->x;
-    float y1 = c1->y;
-
-    if (cov_xz < 0) {
-        swap(x0, x1);
-    }
-    if (cov_yz < 0) {
-        swap(y0, y1);
-    }
-
-    c0->set(x0, y0, c0->z);
-    c1->set(x1, y1, c1->z);
-}
-
-inline static void inset_bbox(Vector3 * restrict c0, Vector3 * restrict c1)
-{
-    Vector3 inset = (*c0 - *c1) / 16.0f - Vector3((8.0f / 255.0f) / 16.0f);
-    *c0 = saturate(*c0 - inset);
-    *c1 = saturate(*c1 + inset);
-}
-
-
-
-// Single color lookup tables from:
-// https://github.com/nothings/stb/blob/master/stb_dxt.h
-static uint8 match5[256][2];
-static uint8 match6[256][2];
-
-static int Mul8Bit(int a, int b)
-{
-    int t = a * b + 128;
-    return (t + (t >> 8)) >> 8;
-}
-
-static inline int Lerp13(int a, int b)
-{
-#ifdef DXT_USE_ROUNDING_BIAS
-    // with rounding bias
-    return a + Mul8Bit(b - a, 0x55);
-#else
-    // without rounding bias
-    // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
-    return (a * 2 + b) / 3;
-#endif
-}
-
-static void PrepareOptTable(uint8 * table, const uint8 * expand, int size)
-{
-    for (int i = 0; i < 256; i++) {
-        int bestErr = 256 * 100;
-
-        for (int min = 0; min < size; min++) {
-            for (int max = 0; max < size; max++) {
-                int mine = expand[min];
-                int maxe = expand[max];
-
-                int err = abs(Lerp13(maxe, mine) - i) * 100;
-
-                // DX10 spec says that interpolation must be within 3% of "correct" result,
-                // add this as error term. (normally we'd expect a random distribution of
-                // +-1.5% error, but nowhere in the spec does it say that the error has to be
-                // unbiased - better safe than sorry).
-                err += abs(max - min) * 3;
-
-                if (err < bestErr) {
-                    bestErr = err;
-                    table[i * 2 + 0] = max;
-                    table[i * 2 + 1] = min;
-                }
-            }
-        }
-    }
-}
-
-// @@ Make this explicit.
-NV_AT_STARTUP(nv::init_dxt1());
-
-void nv::init_dxt1()
-{
-    // Prepare single color lookup tables.
-    uint8 expand5[32];
-    uint8 expand6[64];
-    for (int i = 0; i < 32; i++) expand5[i] = (i << 3) | (i >> 2);
-    for (int i = 0; i < 64; i++) expand6[i] = (i << 2) | (i >> 4);
-
-    PrepareOptTable(&match5[0][0], expand5, 32);
-    PrepareOptTable(&match6[0][0], expand6, 64);
-}
-
-// Single color compressor, based on:
-// https://mollyrocket.com/forums/viewtopic.php?t=392
-static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
-{
-    output->col0.r = match5[c.r][0];
-    output->col0.g = match6[c.g][0];
-    output->col0.b = match5[c.b][0];
-    output->col1.r = match5[c.r][1];
-    output->col1.g = match6[c.g][1];
-    output->col1.b = match5[c.b][1];
-    output->indices = 0xaaaaaaaa;
-    
-    if (output->col0.u < output->col1.u)
-    {
-        swap(output->col0.u, output->col1.u);
-        output->indices ^= 0x55555555;
-    }
-}
-
-
-/*float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
-{
-    ::compress_dxt1_single_color_optimal(c, output);
-
-    // Multiply by 16^2, the weight associated to a single color.
-    // Divide by 255*255 to covert error to [0-1] range.
-    return (256.0f / (255*255)) * evaluate_mse(output, c, output->indices & 3);
-}*/
-
-/*float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output)
-{
-    return compress_dxt1_single_color_optimal(vector3_to_color32(color), output);
-}*/
-
-
-// Compress block using the average color.
-float nv::compress_dxt1_single_color(const nv::Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output)
-{
-    // Compute block average.
-    Vector3 color_sum(0);
-    float weight_sum = 0;
-
-    for (int i = 0; i < count; i++) {
-        color_sum += colors[i] * weights[i];
-        weight_sum += weights[i];
-    }
-
-    // Compress optimally.
-    ::compress_dxt1_single_color_optimal(vector3_to_color32(color_sum / weight_sum), output);
-
-    // Decompress block color.
-    Color32 palette[4];
-    evaluate_palette(output->col0, output->col1, palette);
-    //output->evaluatePalette(palette, /*d3d9=*/false);
-
-    Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]);
-
-    // Evaluate error.
-    float error = 0;
-    for (int i = 0; i < count; i++) {
-        error += weights[i] * evaluate_mse(block_color, colors[i], color_weights);
-    }
-    return error;
-}
-
-
-float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output)
-{
-    // Compute bounding box.
-    Vector3 min_color(1.0f);
-    Vector3 max_color(0.0f);
-
-    for (int i = 0; i < count; i++) {
-        min_color = min(min_color, colors[i]);
-        max_color = max(max_color, colors[i]);
-    }
-
-    // Convert to 5:6:5
-    int min_r = int(31 * min_color.x);
-    int min_g = int(63 * min_color.y);
-    int min_b = int(31 * min_color.z);
-    int max_r = int(31 * max_color.x + 1);
-    int max_g = int(63 * max_color.y + 1);
-    int max_b = int(31 * max_color.z + 1);
-
-    // Expand the box.
-    int range_r = max_r - min_r;
-    int range_g = max_g - min_g;
-    int range_b = max_b - min_b;
-
-    min_r = max(0, min_r - range_r / 2 - 2);
-    min_g = max(0, min_g - range_g / 2 - 2);
-    min_b = max(0, min_b - range_b / 2 - 2);
-
-    max_r = min(31, max_r + range_r / 2 + 2);
-    max_g = min(63, max_g + range_g / 2 + 2);
-    max_b = min(31, max_b + range_b / 2 + 2);
-
-    // Estimate size of search space.
-    int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1);
-
-    // if size under search_limit, then proceed. Note that search_volume is sqrt of number of evaluations.
-    if (volume > max_volume) {
-        return FLT_MAX;
-    }
-
-    // @@ Convert to fixed point before building box?
-    Color32 colors32[16];
-    for (int i = 0; i < count; i++) {
-        colors32[i] = vector3_to_color32(colors[i]);
-    }
-
-    float best_error = FLT_MAX;
-    Color16 best0, best1;           // @@ Record endpoints as Color16?
-
-    Color16 c0, c1;
-    Color32 palette[4];
-
-    for(int r0 = min_r; r0 <= max_r; r0++)
-    for(int g0 = min_g; g0 <= max_g; g0++)
-    for(int b0 = min_b; b0 <= max_b; b0++)
-    {
-        c0.r = r0; c0.g = g0; c0.b = b0;
-        palette[0] = bitexpand_color16_to_color32(c0);
-
-        for(int r1 = min_r; r1 <= max_r; r1++)
-        for(int g1 = min_g; g1 <= max_g; g1++)
-        for(int b1 = min_b; b1 <= max_b; b1++)
-        {
-            c1.r = r1; c1.g = g1; c1.b = b1;
-            palette[1] = bitexpand_color16_to_color32(c1);
-
-            if (c0.u > c1.u) {
-                // Evaluate error in 4 color mode.
-                evaluate_palette4(c0, c1, palette, false);
-            }
-            else {
-                if (three_color_mode) {
-                    // Evaluate error in 3 color mode.
-                    evaluate_palette3(c0, c1, palette);
-                }
-                else {
-                    // Skip 3 color mode.
-                    continue;
-                }
-            }
-
-            float error = evaluate_palette_error(palette, colors32, weights, count);
-
-            if (error < best_error) {
-                best_error = error;
-                best0 = c0;
-                best1 = c1;
-            }
-        }
-    }
-
-    output->col0 = best0;
-    output->col1 = best1;
-
-    Vector3 vector_palette[4];
-    evaluate_palette(output->col0, output->col1, vector_palette);
-
-    output->indices = compute_indices(input_colors, color_weights, vector_palette);
-
-    return best_error / (255 * 255);
-}
-
-
-void nv::compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output)
-{
-    ClusterFit fit;
-    fit.setColorWeights(Vector4(color_weights, 1));
-    fit.setColorSet(colors, weights, count);
-
-    // start & end are in [0, 1] range.
-    Vector3 start, end;
-    fit.compress4(&start, &end);
-
-    if (three_color_mode && fit.compress3(&start, &end)) {
-        output_block3(input_colors, color_weights, start, end, output);
-    }
-    else {
-        output_block4(input_colors, color_weights, start, end, output);
-    }
-}
-
-
-/*static unsigned int stb__MatchColorsBlock(uint8 *block, uint8 *color)
-{
-    uint mask = 0;
-    int dir[3];
-    dir[0] = color[0 * 4 + 0] - color[1 * 4 + 0];
-    dir[1] = color[0 * 4 + 1] - color[1 * 4 + 1];
-    dir[2] = color[0 * 4 + 2] - color[1 * 4 + 2];
-    int dots[16];
-    int stops[4];
-    int i;
-
-    for (i = 0;i < 16;i++)
-        dots[i] = block[i * 4 + 0] * dir[0] + block[i * 4 + 1] * dir[1] + block[i * 4 + 2] * dir[2];
-
-    for (i = 0;i < 4;i++)
-        stops[i] = color[i * 4 + 0] * dir[0] + color[i * 4 + 1] * dir[1] + color[i * 4 + 2] * dir[2];
-
-    // think of the colors as arranged on a line; project point onto that line, then choose
-    // next color out of available ones. we compute the crossover points for "best color in top
-    // half"/"best in bottom half" and then the same inside that subinterval.
-    //
-    // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
-    // but it's very close and a lot faster.
-    // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
-
-    int c0Point = (stops[1] + stops[3]);
-    int halfPoint = (stops[3] + stops[2]);
-    int c3Point = (stops[2] + stops[0]);
-
-    for (i = 15;i >= 0;i--) {
-        int dot = 2 * dots[i];
-        mask <<= 2;
-
-        uint sel;
-        if (dot < halfPoint)
-            sel = (dot < c0Point) ? 1 : 3;
-        else
-            sel = (dot < c3Point) ? 2 : 0;
-
-        mask |= sel;
-    }
-
-    return mask;
-}*/
-
-float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output)
-{
-    Vector3 colors[16];
-    float weights[16];
-    int count = reduce_colors(input_colors, input_weights, colors, weights);
-
-    if (count == 0) {
-        // Output trivial block.
-        output->col0.u = 0;
-        output->col1.u = 0;
-        output->indices = 0;
-        return 0;
-    }
-
-
-    float error = FLT_MAX;
-
-    // Sometimes the single color compressor produces better results than the exhaustive. This introduces discontinuities between blocks that
-    // use different compressors. For this reason, this is not enabled by default.
-    if (0) {
-        error = compress_dxt1_single_color(colors, weights, count, color_weights, output);
-
-        if (error == 0.0f || count == 1) {
-            // Early out.
-            return error;
-        }
-    }
-
-    // This is too expensive, even with a low threshold.
-    // If high quality:
-    if (/* DISABLES CODE */ (0)) {
-        BlockDXT1 exhaustive_output;
-        float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, three_color_mode, 1400, &exhaustive_output);
-
-        if (exhaustive_error != FLT_MAX) {
-            float exhaustive_error2 = evaluate_mse(input_colors, input_weights, color_weights, &exhaustive_output);
-
-            // The exhaustive compressor does not use color_weights, so the results may be different.
-            //nvCheck(equal(exhaustive_error, exhaustive_error2));
-
-            if (exhaustive_error2 < error) {
-                *output = exhaustive_output;
-                error = exhaustive_error;
-            }
-        }
-    }
-
-    // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already.
-    if (error == FLT_MAX && count == 1) {
-        ::compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output);
-        return evaluate_mse(input_colors, input_weights, color_weights, output);
-    }
-
-    if (count > 1) {
-        // Fast box fit encoding:
-        {
-            BlockDXT1 box_fit_output;
-
-            Vector3 colors[16];
-            for (int i = 0; i < 16; i++) {
-                colors[i] = input_colors[i].xyz();
-            }
-            int count = 16;
-
-            // Quick end point selection.
-            Vector3 c0, c1;
-            fit_colors_bbox(colors, count, &c0, &c1);
-            inset_bbox(&c0, &c1);
-            select_diagonal(colors, count, &c0, &c1);
-            output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
-
-            float box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
-            if (box_fit_error < error) {
-                error = box_fit_error;
-                *output = box_fit_output;
-
-                // Refine color for the selected indices.
-                if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
-                    output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
-
-                    box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
-                    if (box_fit_error < error) {
-                        error = box_fit_error;
-                        *output = box_fit_output;
-                    }
-                }
-            }
-        }
-
-        // Try cluster fit.
-        BlockDXT1 cluster_fit_output;
-        compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output);
-
-        float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output);
-
-        if (cluster_fit_error < error) {
-            *output = cluster_fit_output;
-            error = cluster_fit_error;
-        }
-
-        if (hq) {
-            // TODO:
-            // - Optimize palette evaluation when updating only one channel.
-            // - try all diagonals.
-
-            // Things that don't help:
-            // - Alternate endpoint updates.
-            // - Randomize order.
-            // - If one direction does not improve, test opposite direction next.
-
-            static const int8 deltas[16][3] = {
-                {1,0,0},
-                {0,1,0},
-                {0,0,1},
-
-                {-1,0,0},
-                {0,-1,0},
-                {0,0,-1},
-
-                {1,1,0},
-                {1,0,1},
-                {0,1,1},
-
-                {-1,-1,0},
-                {-1,0,-1},
-                {0,-1,-1},
-
-                {-1,1,0},
-                //{-1,0,1},
-
-                {1,-1,0},
-                {0,-1,1},
-
-                //{1,0,-1},
-                {0,1,-1},
-            };
-
-            int lastImprovement = 0;
-            for (int i = 0; i < 256; i++) {
-                BlockDXT1 refined = *output;
-                int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] };
-
-                if ((i / 16) & 1) {
-                    refined.col0.r += delta[0];
-                    refined.col0.g += delta[1];
-                    refined.col0.b += delta[2];
-                }
-                else {
-                    refined.col1.r += delta[0];
-                    refined.col1.g += delta[1];
-                    refined.col1.b += delta[2];
-                }
-
-                if (!three_color_mode) {
-                    if (refined.col0.u == refined.col1.u) refined.col1.g += 1;
-                    if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u);
-                }
-
-                Vector3 palette[4];
-                evaluate_palette(output->col0, output->col1, palette);
-
-                refined.indices = compute_indices(input_colors, color_weights, palette);
-
-                float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
-                if (refined_error < error) {
-                    *output = refined;
-                    error = refined_error;
-                    lastImprovement = i;
-                }
-
-                // Early out if the last 32 steps didn't improve error.
-                if (i - lastImprovement > 32) break;
-            }
-        }
-    }
-
-    return error;
-}
-
-
-// Once we have an index assignment we have colors grouped in 1-4 clusters.
-// If 1 clusters -> Use optimal compressor.
-// If 2 clusters -> Try: (0, 1), (1, 2), (0, 2), (0, 3) - [0, 1]
-// If 3 clusters -> Try: (0, 1, 2), (0, 1, 3), (0, 2, 3) - [0, 1, 2]
-// If 4 clusters -> Try: (0, 1, 2, 3)
-
-// @@ How do we do the initial index/cluster assignment? Use standard cluster fit.
-
-
-
-float nv::compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
-{
-    Vector3 colors[16];
-    for (int i = 0; i < 16; i++) {
-        colors[i] = input_colors[i].xyz();
-    }
-    int count = 16;
-
-    /*float error = FLT_MAX;
-    error = compress_dxt1_single_color(colors, input_weights, count, color_weights, output);
-
-    if (error == 0.0f || count == 1) {
-        // Early out.
-        return error;
-    }*/
-
-    // Quick end point selection.
-    Vector3 c0, c1;
-    fit_colors_bbox(colors, count, &c0, &c1);
-    if (c0 == c1) {
-        ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output);
-        return evaluate_mse(input_colors, input_weights, color_weights, output);
-    }
-    inset_bbox(&c0, &c1);
-    select_diagonal(colors, count, &c0, &c1);
-    output_block4(input_colors, color_weights, c0, c1, output);
-
-    // Refine color for the selected indices.
-    if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
-        output_block4(input_colors, color_weights, c0, c1, output);
-    }
-
-    return evaluate_mse(input_colors, input_weights, color_weights, output);
-}
-
-
-void nv::compress_dxt1_fast2(const uint8 input_colors[16*4], BlockDXT1 * output) {
-    /*Vector3 colors[16];
-    float weights[16];
-    int count = reduce_colors(input_colors, colors, weights);
-
-    if (count == 0) {
-        // Output trivial block.
-        output->col0.u = 0;
-        output->col1.u = 0;
-        output->indices = 0;
-        return;
-    }
-
-
-    float error = FLT_MAX;
-    error = compress_dxt1_single_color(colors, weights, count, Vector3(1.0f), output);
-
-    if (error == 0.0f || count == 1) {
-        // Early out.
-        return;
-    }*/
-
-    Vector3 vec_colors[16];
-    for (int i = 0; i < 16; i++) {
-        vec_colors[i] = Vector3(input_colors[4 * i + 0] / 255.0f, input_colors[4 * i + 1] / 255.0f, input_colors[4 * i + 2] / 255.0f);
-    }
-
-    // Quick end point selection.
-    Vector3 c0, c1;
-    //fit_colors_bbox(colors, count, &c0, &c1);
-    //select_diagonal(colors, count, &c0, &c1);
-    fit_colors_bbox(vec_colors, 16, &c0, &c1);
-    if (c0 == c1) {
-        ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output);
-        return;
-    }
-    inset_bbox(&c0, &c1);
-    select_diagonal(vec_colors, 16, &c0, &c1);
-    output_block4(vec_colors, c0, c1, output);
-
-    // Refine color for the selected indices.
-    if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) {
-        output_block4(vec_colors, c0, c1, output);
-    }
-}
-
-
-/*static int Mul8Bit(int a, int b)
-{
-    int t = a * b + 128;
-    return (t + (t >> 8)) >> 8;
-}*/
-
-static bool compute_least_squares_endpoints(const uint8 *block, uint32 mask, Vector3 *pmax, Vector3 *pmin)
-{
-    static const int w1Tab[4] = { 3,0,2,1 };
-    static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
-    // ^some magic to save a lot of multiplies in the accumulating loop...
-    // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
-
-    int akku = 0;
-    int At1_r, At1_g, At1_b;
-    int At2_r, At2_g, At2_b;
-    unsigned int cm = mask;
-
-    if ((mask ^ (mask << 2)) < 4) // all pixels have the same index?
-    {
-        return false;
-    }
-    else {
-        At1_r = At1_g = At1_b = 0;
-        At2_r = At2_g = At2_b = 0;
-        for (int i = 0;i < 16;++i, cm >>= 2) {
-            int step = cm & 3;
-            int w1 = w1Tab[step];
-            int r = block[i * 4 + 0];
-            int g = block[i * 4 + 1];
-            int b = block[i * 4 + 2];
-
-            akku += prods[step];
-            At1_r += w1 * r;
-            At1_g += w1 * g;
-            At1_b += w1 * b;
-            At2_r += r;
-            At2_g += g;
-            At2_b += b;
-        }
-
-        At2_r = 3 * At2_r - At1_r;
-        At2_g = 3 * At2_g - At1_g;
-        At2_b = 3 * At2_b - At1_b;
-
-        // extract solutions and decide solvability
-        int xx = akku >> 16;
-        int yy = (akku >> 8) & 0xff;
-        int xy = (akku >> 0) & 0xff;
-
-        float f = 3.0f / 255.0f / (xx*yy - xy * xy);
-
-        // solve.
-        pmax->x = (At1_r*yy - At2_r * xy) * f;
-        pmax->y = (At1_r*yy - At2_r * xy) * f;
-        pmax->z = (At1_r*yy - At2_r * xy) * f;
-
-        pmin->x = (At2_r*xx - At1_r * xy) * f;
-        pmin->y = (At2_r*xx - At1_r * xy) * f;
-        pmin->z = (At2_r*xx - At1_r * xy) * f;
-
-        return true;
-    }
-}
-
-
-static uint32 bc1_find_sels(const uint8 *input_colors, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb)
-{
-    uint32_t block_r[4], block_g[4], block_b[4];
-
-    block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2);
-    block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2);
-    block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3;
-    block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3;
-
-    int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];
-
-    int dots[4];
-    for (uint32_t i = 0; i < 4; i++)
-        dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab;
-
-    int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];
-
-    ar *= 2; ag *= 2; ab *= 2;
-
-    uint sels = 0;
-    for (uint32_t i = 0; i < 16; i++)
-    {
-        const int d = input_colors[4*i+0] * ar + input_colors[4*i+1] * ag + input_colors[4*i+2] * ab;
-        static const uint8_t s_sels[4] = { 3, 2, 1, 0 };
-
-        // Rounding matters here!
-        // d <= t0: <=, not <, to the later LS step "sees" a wider range of selectors. It matters for quality.
-        sels |= s_sels[(d <= t0) + (d < t1) + (d < t2)] << (2 * i);
-    }
-    return sels;
-}
-
-
-void nv::compress_dxt1_fast_geld(const uint8 input_colors[16 * 4], BlockDXT1 * block) {
-
-    int fr = input_colors[0];
-    int fg = input_colors[1];
-    int fb = input_colors[2];
-
-    int total_r = fr, total_g = fg, total_b = fb;
-    int max_r = fr, max_g = fg, max_b = fb;
-    int min_r = fr, min_g = fg, min_b = fb;
-    uint32 grayscale_flag = (fr == fg) && (fr == fb);
-    for (uint32 i = 1; i < 16; i++)
-    {
-        const int r = input_colors[4*i+0], g = input_colors[4 * i + 1], b = input_colors[4 * i + 2];
-        grayscale_flag &= ((r == g) && (r == b));
-        max_r = max(max_r, r); max_g = max(max_g, g); max_b = max(max_b, b);
-        min_r = min(min_r, r); min_g = min(min_g, g); min_b = min(min_b, b);
-        total_r += r; total_g += g; total_b += b;
-    }
-
-    int lr, lg, lb;
-    int hr, hg, hb;
-
-    if (grayscale_flag) {
-        // Grayscale blocks are a common enough case to specialize.
-        lr = lb = Mul8Bit(min_r, 31);
-        lg = Mul8Bit(min_r, 63);
-
-        hr = hb = Mul8Bit(max_r, 31);
-        hg = Mul8Bit(max_r, 63);
-    }
-    else {
-        int avg_r = (total_r + 8) >> 4, avg_g = (total_g + 8) >> 4, avg_b = (total_b + 8) >> 4;
-
-        // Find the shortest vector from a AABB corner to the block's average color.
-        // This is to help avoid outliers.
-
-        uint32_t dist[3][2];
-        dist[0][0] = square(min_r - avg_r) << 3; dist[0][1] = square(max_r - avg_r) << 3;
-        dist[1][0] = square(min_g - avg_g) << 3; dist[1][1] = square(max_g - avg_g) << 3;
-        dist[2][0] = square(min_b - avg_b) << 3; dist[2][1] = square(max_b - avg_b) << 3;
-
-        uint32_t min_d0 = (dist[0][0] + dist[1][0] + dist[2][0]);
-        uint32_t d4 = (dist[0][0] + dist[1][0] + dist[2][1]) | 4;
-        min_d0 = min(min_d0, d4);
-
-        uint32_t min_d1 = (dist[0][1] + dist[1][0] + dist[2][0]) | 1;
-        uint32_t d5 = (dist[0][1] + dist[1][0] + dist[2][1]) | 5;
-        min_d1 = min(min_d1, d5);
-
-        uint32_t d2 = (dist[0][0] + dist[1][1] + dist[2][0]) | 2;
-        min_d0 = min(min_d0, d2);
-
-        uint32_t d3 = (dist[0][1] + dist[1][1] + dist[2][0]) | 3;
-        min_d1 = min(min_d1, d3);
-
-        uint32_t d6 = (dist[0][0] + dist[1][1] + dist[2][1]) | 6;
-        min_d0 = min(min_d0, d6);
-
-        uint32_t d7 = (dist[0][1] + dist[1][1] + dist[2][1]) | 7;
-        min_d1 = min(min_d1, d7);
-
-        uint32_t min_d = min(min_d0, min_d1);
-        uint32_t best_i = min_d & 7;
-
-        const int delta_r = (best_i & 1) ? (max_r - avg_r) : (avg_r - min_r);
-        const int delta_g = (best_i & 2) ? (max_g - avg_g) : (avg_g - min_g);
-        const int delta_b = (best_i & 4) ? (max_b - avg_b) : (avg_b - min_b);
-
-        // Now we have a smaller AABB going from the block's average color to a cornerpoint of the larger AABB.
-        // Project all pixels colors along the 4 vectors going from a smaller AABB cornerpoint to the opposite cornerpoint, find largest projection.
-        // One of these vectors will be a decent approximation of the block's PCA.
-        const int saxis0_r = delta_r, saxis0_g = delta_g, saxis0_b = delta_b;
-
-        int low_dot0 = INT_MAX, high_dot0 = INT_MIN;
-        int low_dot1 = INT_MAX, high_dot1 = INT_MIN;
-        int low_dot2 = INT_MAX, high_dot2 = INT_MIN;
-        int low_dot3 = INT_MAX, high_dot3 = INT_MIN;
-
-        int low_c0, low_c1, low_c2, low_c3;
-        int high_c0, high_c1, high_c2, high_c3;
-
-        for (uint32_t i = 0; i < 16; i++)
-        {
-            const int dotx = input_colors[4*i+0] * saxis0_r;
-            const int doty = input_colors[4*i+1] * saxis0_g;
-            const int dotz = input_colors[4*i+2] * saxis0_b;
-
-            const int dot0 = ((dotz + dotx + doty) << 4) + i;
-            const int dot1 = ((dotz - dotx - doty) << 4) + i;
-            const int dot2 = ((dotz - dotx + doty) << 4) + i;
-            const int dot3 = ((dotz + dotx - doty) << 4) + i;
-
-            if (dot0 < low_dot0)
-            {
-                low_dot0 = dot0;
-                low_c0 = i;
-            }
-            if ((dot0 ^ 15) > high_dot0)
-            {
-                high_dot0 = dot0 ^ 15;
-                high_c0 = i;
-            }
-
-            if (dot1 < low_dot1)
-            {
-                low_dot1 = dot1;
-                low_c1 = i;
-            }
-            if ((dot1 ^ 15) > high_dot1)
-            {
-                high_dot1 = dot1 ^ 15;
-                high_c1 = i;
-            }
-
-            if (dot2 < low_dot2)
-            {
-                low_dot2 = dot2;
-                low_c2 = i;
-            }
-            if ((dot2 ^ 15) > high_dot2)
-            {
-                high_dot2 = dot2 ^ 15;
-                high_c2 = i;
-            }
-
-            if (dot3 < low_dot3)
-            {
-                low_dot3 = dot3;
-                low_c3 = i;
-            }
-            if ((dot3 ^ 15) > high_dot3)
-            {
-                high_dot3 = dot3 ^ 15;
-                high_c3 = i;
-            }
-        }
-
-
-        uint32_t low_c = low_dot0 & 15, high_c = ~high_dot0 & 15, r = (high_dot0 & ~15) - (low_dot0 & ~15);
-
-        uint32_t tr = (high_dot1 & ~15) - (low_dot1 & ~15);
-        if (tr > r)
-            low_c = low_dot1 & 15, high_c = ~high_dot1 & 15, r = tr;
-
-        tr = (high_dot2 & ~15) - (low_dot2 & ~15);
-        if (tr > r)
-            low_c = low_dot2 & 15, high_c = ~high_dot2 & 15, r = tr;
-
-        tr = (high_dot3 & ~15) - (low_dot3 & ~15);
-        if (tr > r)
-            low_c = low_dot3 & 15, high_c = ~high_dot3 & 15;
-
-        lr = Mul8Bit(input_colors[low_c*4+0], 31);
-        lg = Mul8Bit(input_colors[low_c*4+1], 63);
-        lb = Mul8Bit(input_colors[low_c*4+2], 31);
-
-        hr = Mul8Bit(input_colors[high_c*4+0], 31);
-        hg = Mul8Bit(input_colors[high_c*4+1], 63);
-        hb = Mul8Bit(input_colors[high_c*4+2], 31);
-    }
-
-    uint32 selectors = bc1_find_sels(input_colors, lr, lg, lb, hr, hg, hb);
-
-    Vector3 c0, c1;
-    if (!compute_least_squares_endpoints(input_colors, selectors, &c0, &c1)) {
-        // @@ Single color compressor.
-        Color32 c;
-        c.r = lr;
-        c.g = lg;
-        c.b = lb;
-        ::compress_dxt1_single_color_optimal(c, block);
-    }
-    else {
-        Color16 color0 = vector3_to_color16(c0);
-        Color16 color1 = vector3_to_color16(c1);
-
-        if (color0.u < color1.u) {
-            swap(color0, color1);
-        }
-
-        Color32 palette[4];
-        evaluate_palette(color0, color1, palette);
-
-        block->col0 = color0;
-        block->col1 = color1;
-        block->indices = bc1_find_sels(input_colors, color0.r, color0.g, color0.b, color1.r, color1.g, color1.b);
-    }
-
-    /*// Quick end point selection.
-    Vector3 c0, c1;
-    //fit_colors_bbox(colors, count, &c0, &c1);
-    //select_diagonal(colors, count, &c0, &c1);
-    fit_colors_bbox(vec_colors, 16, &c0, &c1);
-    if (c0 == c1) {
-        ::compress_dxt1_single_color_optimal(vector3_to_color32(c0), output);
-        return;
-    }
-    inset_bbox(&c0, &c1);
-    select_diagonal(vec_colors, 16, &c0, &c1);
-    output_block4(vec_colors, c0, c1, output);
-
-    // Refine color for the selected indices.
-    if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) {
-        output_block4(vec_colors, c0, c1, output);
-    }*/
-}
diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h
deleted file mode 100644
index ca46b6c..0000000
--- a/src/nvtt/CompressorDXT1.h
+++ /dev/null
@@ -1,29 +0,0 @@
-
-namespace nv {
-
-    struct BlockDXT1;
-    class Vector3;
-    class Vector4;
-
-    void init_dxt1();
-
-    // All these functions return MSE.
-
-    float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
-    //float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
-    float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int search_limit, BlockDXT1 * output);
-    void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
-
-    // Cluster fit end point selection.
-    float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output);
-
-    // Quick end point selection followed by least squares refinement.
-    float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output);
-
-    // @@ Change these interfaces to take a pitch argument instead of assuming (4*4), just like CMP_Core.
-    void compress_dxt1_fast2(const unsigned char input_colors[16*4], BlockDXT1 * output);
-    void compress_dxt1_fast_geld(const unsigned char input_colors[16 * 4], BlockDXT1 * output);
-
-    float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const BlockDXT1 * block, int decoder = 0);
-
-}
diff --git a/src/nvtt/CompressorDXT5_RGBM.cpp b/src/nvtt/CompressorDXT5_RGBM.cpp
index 0002470..0ebacc3 100644
--- a/src/nvtt/CompressorDXT5_RGBM.cpp
+++ b/src/nvtt/CompressorDXT5_RGBM.cpp
@@ -1,5 +1,5 @@
 #include "CompressorDXT5_RGBM.h"
-#include "CompressorDXT1.h"
+#include "icbc.h"
 
 #include "OptimalCompressDXT.h"
 #include "QuickCompressDXT.h"
@@ -58,8 +58,10 @@ float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_w
     float rgb_weights[16];
     convert_to_rgbm(input_colors, input_weights, min_m, input_colors_rgbm, rgb_weights);
 
+    float color_weights[3] = { 1.0f,1.0f,1.0f };
+
     // Compress RGB.
-    compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, /*hq=*/false, &output->color);
+    icbc::compress_dxt1((float *)input_colors_rgbm, rgb_weights, color_weights, /*three_color_mode=*/false, /*hq=*/false, &output->color);
 
     // Decompress RGB/M block.
     nv::ColorBlock RGB;
diff --git a/src/nvtt/CompressorETC.cpp b/src/nvtt/CompressorETC.cpp
index 4fb8e1c..d3acc87 100644
--- a/src/nvtt/CompressorETC.cpp
+++ b/src/nvtt/CompressorETC.cpp
@@ -6,7 +6,7 @@
 #include "nvmath/Color.inl"
 #include "nvcore/Utils.h"    // clamp
 
-#define HAVE_RGETC NV_OS_OSX
+#define HAVE_RGETC 1
 #define HAVE_ETCPACK 0 // Only enable in OSX for debugging.
 
 #if HAVE_RGETC
@@ -190,7 +190,7 @@ static const float midpoints5[32] = {
 // ETC2 Modes:
 // - ETC1:
 //  - two partitions (flip modes): 2*(4x2, 2x4)
-//  - two base colors sotred as 444+444 or 555+333
+//  - two base colors stored as 444+444 or 555+333
 //  - two 3 bit intensity modifiers
 // - T Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
 // - H Mode. 2 colors 444, 3 bit intensity modifiers, 2 bit indices.
diff --git a/src/nvtt/Context.cpp b/src/nvtt/Context.cpp
index 37c201b..695e737 100644
--- a/src/nvtt/Context.cpp
+++ b/src/nvtt/Context.cpp
@@ -30,6 +30,7 @@
 #include "CompressionOptions.h"
 #include "OutputOptions.h"
 #include "Surface.h"
+#include "icbc.h"
 
 #include "CompressorDX9.h"
 #include "CompressorDX10.h"
@@ -67,6 +68,8 @@ Compressor::Compressor() : m(*new Compressor::Private())
     enableCudaAcceleration(m.cudaSupported);
 
     m.dispatcher = &m.defaultDispatcher;
+
+    icbc::init();
 }
 
 Compressor::~Compressor()
diff --git a/src/nvtt/icbc.cpp b/src/nvtt/icbc.cpp
new file mode 100644
index 0000000..cb2215c
--- /dev/null
+++ b/src/nvtt/icbc.cpp
@@ -0,0 +1,2 @@
+#define ICBC_IMPLEMENTATION
+#include "icbc.h"
diff --git a/src/nvtt/icbc.h b/src/nvtt/icbc.h
new file mode 100644
index 0000000..56bf3b7
--- /dev/null
+++ b/src/nvtt/icbc.h
@@ -0,0 +1,3922 @@
+#pragma once
+// icbc: BC1 (DXT1) block compression. Define ICBC_IMPLEMENTATION in one source file before including this header to emit the implementation.
+namespace icbc {
+
+    void init();    // Called from nvtt's Compressor constructor; what it sets up is defined in the implementation section.
+    // Blocks are 16 texels with 4 components each; output points at the destination block.
+    float compress_dxt1(const float input_colors[16 * 4], const float input_weights[16], const float color_weights[3], bool three_color_mode, bool hq, void * output);
+    float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float color_weights[3], void * output);
+    void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output);
+
+    enum Decoder {
+        Decoder_D3D10 = 0,
+        Decoder_NVIDIA = 1,
+        Decoder_AMD = 2
+    };
+
+    float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10);
+
+}
+
+#ifdef ICBC_IMPLEMENTATION
+
+#ifndef ICBC_USE_SSE
+#define ICBC_USE_SSE 2       // 0 = scalar code, 1 = SSE (truncate() goes through MMX conversions), 2 = SSE2
+#endif
+// NOTE(review): the numbering in the comment below differs from enum icbc::Decoder (D3D10 = 0, NVIDIA = 1, AMD = 2) -- confirm which one applies.
+#ifndef ICBC_DECODER
+#define ICBC_DECODER 0       // 0 = d3d10, 1 = d3d9, 2 = nvidia, 3 = amd
+#endif
+
+#define ICBC_USE_SIMD ICBC_USE_SSE
+
+// Some testing knobs:
+#define ICBC_FAST_CLUSTER_FIT 0     // This ignores input weights for a moderate speedup.
+#define ICBC_PERFECT_ROUND 0        // Enable perfect rounding in scalar code path only.
+
+#include <stdint.h>
+#include <string.h> // memset
+#include <math.h>   // floorf
+#include <float.h>  // FLT_MAX
+
+#ifndef ICBC_ASSERT
+#define ICBC_ASSERT assert
+#include <assert.h>
+#endif
+
+namespace icbc {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Basic Templates
+
+template <typename T> inline void swap(T & a, T & b) {
+    T saved(a);     // Exchange through a single temporary copy.
+    a = b;
+    b = saved;
+}
+
+template <typename T> inline T max(const T & a, const T & b) {
+    if (b < a) return a;
+    return b;       // Also the result when neither argument is greater.
+}
+template <typename T> inline T min(const T & a, const T & b) {
+    if (a < b) return a;
+    return b;       // Also the result when a is not strictly smaller.
+}
+
+template <typename T> inline T clamp(const T & x, const T & a, const T & b) {
+    return min(max(x, a), b);   // Lower bound a is applied first, then upper bound b.
+}
+template <typename T> inline T square(const T & a) {
+    return a * a;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Basic Types
+
+typedef uint8_t uint8;
+typedef int8_t int8;
+typedef uint16_t uint16;
+typedef uint32_t uint32;
+typedef uint32_t uint;
+
+
+struct Color16 {    // 16-bit 5:6:5 color; u aliases the three bit-fields.
+    union {
+        struct {
+            uint16 b : 5;   // NOTE(review): bit-field placement is implementation-defined; vector3_to_color16 packs u as (r << 11) | (g << 5) | b.
+            uint16 g : 6;
+            uint16 r : 5;
+        };
+        uint16 u;
+    };
+};
+
+struct Color32 {    // Four 8-bit channels; u aliases the same four bytes.
+    union {
+        struct {
+            uint8 b, g, r, a;   // Declared in b, g, r, a order.
+        };
+        uint32 u;
+    };
+};
+
+struct BlockDXT1 {  // Encoded block: two 5:6:5 endpoint colors followed by 32 bits of indices.
+    Color16 col0;
+    Color16 col1;
+    uint32 indices;
+};
+
+
+struct Vector3 {    // Plain aggregate of three floats, brace-initialized throughout this file.
+    float x;
+    float y;
+    float z;
+    // The compound operators below return void, so they cannot be chained.
+    inline void operator+=(Vector3 v) {
+        x += v.x; y += v.y; z += v.z;
+    }
+    inline void operator*=(Vector3 v) {     // Component-wise product.
+        x *= v.x; y *= v.y; z *= v.z;
+    }
+    inline void operator*=(float s) {       // Uniform scale.
+        x *= s; y *= s; z *= s;
+    }
+};
+
+struct Vector4 {    // Four floats; xyz overlays the first three so they can be read or written as a Vector3.
+    union {
+        struct {
+            float x, y, z, w;
+        };
+        Vector3 xyz;
+    };
+};
+
+
+inline Vector3 operator*(Vector3 v, float s) {    // Uniform scale.
+    return { v.x * s, v.y * s, v.z * s };
+}
+
+inline Vector3 operator*(float s, Vector3 v) {
+    return { v.x * s, v.y * s, v.z * s };
+}
+
+inline Vector3 operator*(Vector3 a, Vector3 b) {  // Component-wise product (not a dot or cross product).
+    return { a.x * b.x, a.y * b.y, a.z * b.z };
+}
+
+inline float dot(Vector3 a, Vector3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+inline Vector3 operator+(Vector3 a, Vector3 b) {
+    return { a.x + b.x, a.y + b.y, a.z + b.z };
+}
+
+inline Vector3 operator-(Vector3 a, Vector3 b) {
+    return { a.x - b.x, a.y - b.y, a.z - b.z };
+}
+
+inline Vector3 operator/(Vector3 v, float s) {    // No guard against s == 0.
+    return { v.x / s, v.y / s, v.z / s };
+}
+
+inline float saturate(float x) {                  // Clamp to [0, 1].
+    return clamp(x, 0.0f, 1.0f);
+}
+
+inline Vector3 saturate(Vector3 v) {
+    return { saturate(v.x), saturate(v.y), saturate(v.z) };
+}
+
+inline Vector3 min(Vector3 a, Vector3 b) {        // Component-wise minimum.
+    return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
+}
+
+inline Vector3 max(Vector3 a, Vector3 b) {        // Component-wise maximum.
+    return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
+}
+
+inline Vector3 round(Vector3 v) {                 // floor(x + 0.5) per component.
+    return { floorf(v.x+0.5f), floorf(v.y + 0.5f), floorf(v.z + 0.5f) };
+}
+
+inline Vector3 floor(Vector3 v) {
+    return { floorf(v.x), floorf(v.y), floorf(v.z) };
+}
+
+inline bool operator==(const Vector3 & a, const Vector3 & b) {  // Exact comparison; see equal() for a tolerance test.
+    return a.x == b.x && a.y == b.y && a.z == b.z;
+}
+
+inline Vector3 scalar_to_vector3(float f) {       // Replicates f into all three components.
+    return {f, f, f};
+}
+
+inline float lengthSquared(Vector3 v) {
+    return dot(v, v);
+}
+
+inline bool equal(float a, float b, float epsilon = 0.0001) {
+    // http://realtimecollisiondetection.net/blog/?p=89
+    //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
+    return fabsf(a - b) < epsilon;  // Absolute tolerance only; the relative form above is disabled.
+}
+
+inline bool equal(Vector3 a, Vector3 b, float epsilon) {  // Every component must be within epsilon; no default value here.
+    return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// SIMD
+
+#ifndef ICBC_ALIGN_16
+#if __GNUC__
+#   define ICBC_ALIGN_16 __attribute__ ((__aligned__ (16)))
+#else // _MSC_VER
+#   define ICBC_ALIGN_16 __declspec(align(16))
+#endif
+#endif
+#if ICBC_USE_SIMD
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#define SIMD_INLINE inline
+#if __GNUC__   // __forceinline is an MSVC keyword; GCC/Clang need the attribute form.
+#   define SIMD_NATIVE inline __attribute__((always_inline))
+#else // _MSC_VER
+#   define SIMD_NATIVE __forceinline
+#endif
+class SimdVector    // Thin wrapper around one __m128 (four float lanes).
+{
+public:
+    __m128 vec;
+
+    typedef SimdVector const& Arg;
+
+    SIMD_NATIVE SimdVector() {}     // Leaves vec uninitialized.
+
+    SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
+
+    SIMD_NATIVE explicit SimdVector(float f) {  // Broadcasts f to all four lanes.
+        vec = _mm_set1_ps(f);
+    }
+
+    SIMD_NATIVE explicit SimdVector(const float * v)
+    {
+        vec = _mm_load_ps(v);   // Aligned load: v must point at four floats on a 16-byte boundary.
+    }
+
+    SIMD_NATIVE SimdVector(float x, float y, float z, float w)
+    {
+        vec = _mm_setr_ps(x, y, z, w);  // x lands in lane 0.
+    }
+
+    SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+
+    SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
+    {
+        vec = arg.vec;
+        return *this;
+    }
+
+    SIMD_INLINE float toFloat() const   // Returns lane 0 only.
+    {
+        ICBC_ALIGN_16 float f;
+        _mm_store_ss(&f, vec);
+        return f;
+    }
+
+    SIMD_INLINE Vector3 toVector3() const   // Returns lanes 0..2; lane 3 is dropped.
+    {
+        ICBC_ALIGN_16 float c[4];
+        _mm_store_ps(c, vec);
+        return { c[0], c[1], c[2] };
+    }
+
+#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
+    SIMD_NATIVE SimdVector splatX() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(0))); }
+    SIMD_NATIVE SimdVector splatY() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(1))); }
+    SIMD_NATIVE SimdVector splatZ() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(2))); }
+    SIMD_NATIVE SimdVector splatW() const { return SimdVector(_mm_shuffle_ps(vec, vec, SSE_SPLAT(3))); }
+#undef SSE_SPLAT
+
+    SIMD_NATIVE SimdVector& operator+=(Arg v)   // The compound operators work lane by lane.
+    {
+        vec = _mm_add_ps(vec, v.vec);
+        return *this;
+    }
+
+    SIMD_NATIVE SimdVector& operator-=(Arg v)
+    {
+        vec = _mm_sub_ps(vec, v.vec);
+        return *this;
+    }
+
+    SIMD_NATIVE SimdVector& operator*=(Arg v)
+    {
+        vec = _mm_mul_ps(vec, v.vec);
+        return *this;
+    }
+};
+
+
+SIMD_NATIVE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right)   // Lane-wise sum.
+{
+    return SimdVector(_mm_add_ps(left.vec, right.vec));
+}
+
+SIMD_NATIVE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right)   // Lane-wise difference.
+{
+    return SimdVector(_mm_sub_ps(left.vec, right.vec));
+}
+
+SIMD_NATIVE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right)   // Lane-wise product.
+{
+    return SimdVector(_mm_mul_ps(left.vec, right.vec));
+}
+
+// Returns a*b + c (separate multiply and add, not a fused instruction)
+SIMD_INLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+{
+    return SimdVector(_mm_add_ps(_mm_mul_ps(a.vec, b.vec), c.vec));
+}
+
+// Returns -( a*b - c ), i.e. c - a*b
+SIMD_INLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+{
+    return SimdVector(_mm_sub_ps(c.vec, _mm_mul_ps(a.vec, b.vec)));
+}
+
+SIMD_INLINE SimdVector reciprocal(SimdVector::Arg v)   // Lane-wise approximation of 1/v.
+{
+    // get the reciprocal estimate
+    __m128 estimate = _mm_rcp_ps(v.vec);
+
+    // one round of Newton-Raphson refinement: estimate + estimate * (1 - estimate * v)
+    __m128 diff = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(estimate, v.vec));
+    return SimdVector(_mm_add_ps(_mm_mul_ps(diff, estimate), estimate));
+}
+
+SIMD_NATIVE SimdVector min(SimdVector::Arg left, SimdVector::Arg right)     // Lane-wise minimum.
+{
+    return SimdVector(_mm_min_ps(left.vec, right.vec));
+}
+
+SIMD_NATIVE SimdVector max(SimdVector::Arg left, SimdVector::Arg right)     // Lane-wise maximum.
+{
+    return SimdVector(_mm_max_ps(left.vec, right.vec));
+}
+
+SIMD_INLINE SimdVector truncate(SimdVector::Arg v)     // Rounds every lane toward zero via a float -> int32 -> float round trip.
+{
+#if (ICBC_USE_SSE == 1)
+    // convert to ints
+    __m128 input = v.vec;
+    __m64 lo = _mm_cvttps_pi32(input);
+    __m64 hi = _mm_cvttps_pi32(_mm_movehl_ps(input, input));
+
+    // convert to floats
+    __m128 part = _mm_movelh_ps(input, _mm_cvtpi32_ps(input, hi));
+    __m128 truncated = _mm_cvtpi32_ps(part, lo);
+
+    // clear out the MMX multimedia state to allow FP calls later
+    _mm_empty();
+    return SimdVector(truncated);
+#else
+    // use SSE2 instructions
+    return SimdVector(_mm_cvtepi32_ps(_mm_cvttps_epi32(v.vec)));
+#endif
+}
+
+SIMD_INLINE SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits)   // Bitwise blend: (off & ~bits) | (on & bits).
+{
+    __m128 a = _mm_andnot_ps(bits.vec, off.vec);
+    __m128 b = _mm_and_ps(bits.vec, on.vec);
+
+    return SimdVector(_mm_or_ps(a, b));
+}
+
+SIMD_INLINE bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right)   // True when left < right in at least one of the four lanes.
+{
+    __m128 bits = _mm_cmplt_ps(left.vec, right.vec);
+    int value = _mm_movemask_ps(bits);
+    return value != 0;
+}
+
+#endif // ICBC_USE_SIMD
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Color conversion functions.
+
+static const float midpoints5[32] = {
+    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
+    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
+};
+
+static const float midpoints6[64] = {
+    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 
+    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 
+    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 
+    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
+};
+
+/*void init_tables() {
+    for (int i = 0; i < 31; i++) {
+        float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
+        float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
+        midpoints5[i] = (f0 + f1) * 0.5;
+    }
+    midpoints5[31] = 1.0f;
+
+    for (int i = 0; i < 63; i++) {
+        float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
+        float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
+        midpoints6[i] = (f0 + f1) * 0.5;
+    }
+    midpoints6[63] = 1.0f;
+}*/
+
+static Color16 vector3_to_color16(const Vector3 & v) {
+    // Quantizes an RGB vector (nominally in [0, 1]) to 5:6:5. Out-of-range components saturate to the nearest end level.
+    // Truncate.
+    uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f));
+    uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f));
+    uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f));
+
+    // Round exactly according to 565 bit-expansion. The top level is never bumped: the last midpoint is 1.0, so an input above 1 would otherwise overflow into the next field.
+    r += (r < 31 && v.x > midpoints5[r]);
+    g += (g < 63 && v.y > midpoints6[g]);
+    b += (b < 31 && v.z > midpoints5[b]);
+
+    Color16 c;
+    c.u = uint16((r << 11) | (g << 5) | b);
+    return c;
+}
+
+static Color32 bitexpand_color16_to_color32(Color16 c16) {   // Expands 5:6:5 to 8 bits per channel by replicating each channel's top bits into its low bits.
+    Color32 c32;
+    //c32.b = (c16.b << 3) | (c16.b >> 2);
+    //c32.g = (c16.g << 2) | (c16.g >> 4);
+    //c32.r = (c16.r << 3) | (c16.r >> 2);
+    //c32.a = 0xFF;
+    // Packed form of the per-channel code above. The top 8 bits of u stay zero here, whereas the reference code sets a = 0xFF.
+    c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
+    c32.u |= (c32.u >> 5) & 0x070007;
+    c32.u |= (c32.u >> 6) & 0x000300;
+
+    return c32;
+}
+
+inline Vector3 color_to_vector3(Color32 c) {     // Maps 8-bit r, g, b to floats in [0, 1]; alpha is ignored.
+    return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f };
+}
+
+inline Color32 vector3_to_color32(Vector3 v) {   // Saturates to [0, 1], scales by 255 and rounds to nearest; alpha is forced to 255.
+    Color32 color;
+    color.r = uint8(saturate(v.x) * 255 + 0.5f);
+    color.g = uint8(saturate(v.y) * 255 + 0.5f);
+    color.b = uint8(saturate(v.z) * 255 + 0.5f);
+    color.a = 255;
+    return color;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Input block processing.
+
+// Find similar colors and combine them together.
+static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights)
+{   // Reads 16 texels; writes up to 16 merged entries to colors/weights and returns how many were written.
+#if 0
+    for (int i = 0; i < 16; i++) {
+        colors[i] = input_colors[i].xyz;
+        weights[i] = input_weights[i];
+    }
+    return 16;
+#else
+    int n = 0;
+    for (int i = 0; i < 16; i++)
+    {
+        Vector3 ci = input_colors[i].xyz;
+        float wi = input_weights[i];
+
+        float threshold = 1.0 / 256;
+
+        if (wi > 0) {   // Texels with zero or negative weight are dropped, so the result can be 0.
+            // Find matching color.
+            int j;
+            for (j = 0; j < n; j++) {
+                if (equal(colors[j], ci, threshold)) {
+                    weights[j] += wi;   // The first color seen stays as representative; only the weight accumulates.
+                    break;
+                }
+            }
+
+            // No match found. Add new color.
+            if (j == n) {
+                colors[n] = ci;
+                weights[n] = wi;
+                n++;
+            }
+        }
+    }
+
+    ICBC_ASSERT(n <= 16);
+
+    return n;
+#endif
+}
+
+static int reduce_colors(const uint8 * input_colors, Vector3 * colors, float * weights)
+{   // Byte-input variant: reads 16 texels of 4 bytes (the 4th byte is not read), counts each texel with weight 1, returns the number of entries written.
+    int n = 0;
+    for (int i = 0; i < 16; i++)
+    {
+        Vector3 ci;
+        ci.x = float(input_colors[4 * i + 0]);  // Values stay on the 0..255 scale; they are not divided by 255 here.
+        ci.y = float(input_colors[4 * i + 1]);
+        ci.z = float(input_colors[4 * i + 2]);
+
+        float threshold = 1.0 / 256;    // Components are whole numbers here, so only identical colors fall within this threshold.
+
+        // Find matching color.
+        int j;
+        for (j = 0; j < n; j++) {
+            if (equal(colors[j], ci, threshold)) {
+                weights[j] += 1.0f;
+                break;
+            }
+        }
+
+        // No match found. Add new color.
+        if (j == n) {
+            colors[n] = ci;
+            weights[n] = 1.0f;
+            n++;
+        }
+    }
+
+    ICBC_ASSERT(n <= 16);
+
+    return n;
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Cluster Fit
+
+class ClusterFit    // Holds a color set sorted along its principal axis plus running sums; the compress* methods are defined further down the file.
+{
+public:
+    ClusterFit() {}     // Leaves every member uninitialized; setColorSet() fills them in.
+
+    void setErrorMetric(const Vector3 & metric);
+    // Both setColorSet overloads call setErrorMetric(metric) themselves and reset the best error to FLT_MAX.
+    void setColorSet(const Vector3 * colors, const float * weights, int count, const Vector3 & metric);
+    void setColorSet(const Vector4 * colors, const Vector3 & metric);   // Always 16 colors, each with weight 1.
+
+    float bestError() const;
+
+    bool compress3(Vector3 * start, Vector3 * end);
+    bool compress4(Vector3 * start, Vector3 * end);
+
+    bool fastCompress3(Vector3 * start, Vector3 * end);
+    bool fastCompress4(Vector3 * start, Vector3 * end);
+
+
+private:
+
+    uint m_count;       // Number of valid entries in m_weighted (at most 16).
+
+#if ICBC_USE_SIMD
+    ICBC_ALIGN_16 SimdVector m_weighted[16]; // color | weight
+    SimdVector m_metric;                // vec3 (setErrorMetric puts 1 in the w lane)
+    SimdVector m_metricSqr;             // vec3
+    SimdVector m_xxsum;                 // color | weight
+    SimdVector m_xsum;                  // color | weight (wsum)
+    SimdVector m_besterror;             // scalar
+#else
+    Vector3 m_weighted[16];     // Colors pre-multiplied by their weights.
+    float m_weights[16];
+    Vector3 m_metric;
+    Vector3 m_metricSqr;
+    Vector3 m_xxsum;            // Sum of squared weighted colors.
+    Vector3 m_xsum;             // Sum of weighted colors.
+    float m_wsum;               // Sum of weights.
+    float m_besterror;
+#endif
+};
+
+
+static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights)
+{   // Weighted mean of the first n points.
+    Vector3 centroid = { 0 };
+    float total = 0.0f;
+
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i] * points[i];
+    }
+    centroid *= (1.0f / total);     // No guard: n == 0 or an all-zero weight set divides by zero here.
+
+    return centroid;
+}
+
+static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance)
+{   // Fills covariance[0..5] with the unique entries of the symmetric 3x3 matrix (xx, xy, xz, yy, yz, zz) and returns the centroid.
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points, weights);
+
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 a = (points[i] - centroid);    // @@ I think weight should be squared, but that seems to increase the error slightly.
+        Vector3 b = weights[i] * a;
+
+        covariance[0] += a.x * b.x;     // The sums are not divided by the total weight.
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.y * b.y;
+        covariance[4] += a.y * b.z;
+        covariance[5] += a.z * b.z;
+    }
+
+    return centroid;
+}
+
+// @@ We should be able to do something cheaper...
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
+{   // Rebuilds the three rows of the symmetric matrix from its 6 packed entries and returns the row with the largest squared length.
+    const Vector3 row0 = { matrix[0], matrix[1], matrix[2] };
+    const Vector3 row1 = { matrix[1], matrix[3], matrix[4] };
+    const Vector3 row2 = { matrix[2], matrix[4], matrix[5] };
+
+    float r0 = lengthSquared(row0);
+    float r1 = lengthSquared(row1);
+    float r2 = lengthSquared(row2);
+
+    if (r0 > r1 && r0 > r2) return row0;
+    if (r1 > r2) return row1;
+    return row2;    // Also taken on ties.
+}
+
+static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
+{   // Runs 8 power iterations on the packed symmetric 3x3 matrix, starting from its longest row.
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return {0};     // All-zero diagonal: return the zero vector instead of iterating.
+    }
+
+    Vector3 v = estimatePrincipalComponent(matrix);
+
+    const int NUM = 8;
+    for (int i = 0; i < NUM; i++)
+    {
+        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+
+        float norm = max(max(x, y), z);     // Largest signed component, not a Euclidean length; the result is not unit length.
+
+        v = { x, y, z };
+        v *= (1.0f / norm);     // NOTE(review): divides by zero if the largest component is 0 -- confirm covariance inputs rule that out.
+    }
+
+    return v;
+}
+
+static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights)
+{   // Builds the weighted covariance of the points and returns its dominant direction (power method).
+    float matrix[6];
+    computeCovariance(n, points, weights, matrix);     // The centroid it returns is not needed here.
+
+    return firstEigenVector_PowerMethod(matrix);
+}
+
+
+void ClusterFit::setErrorMetric(const Vector3 & metric)
+{   // Stores the per-channel metric and its component-wise square.
+#if ICBC_USE_SIMD
+    ICBC_ALIGN_16 Vector4 tmp;     // Aligned staging copy for the aligned load in SimdVector(const float *).
+    tmp.xyz = metric;
+    tmp.w = 1;
+    m_metric = SimdVector(&tmp.x);
+#else
+    m_metric = metric;
+#endif
+    m_metricSqr = m_metric * m_metric;
+}
+
+void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count, const Vector3 & metric)
+{   // Loads `count` weighted colors (the local arrays hold 16), ordered by their projection onto the principal axis.
+    setErrorMetric(metric);
+
+    // initialise the best error
+#if ICBC_USE_SIMD
+    m_besterror = SimdVector(FLT_MAX);
+#else
+    m_besterror = FLT_MAX;
+#endif
+
+    m_count = count;
+
+    // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results.
+    Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights);
+
+    // build the list of values
+    int order[16];
+    float dps[16];
+    for (uint i = 0; i < m_count; ++i)
+    {
+        order[i] = i;
+        dps[i] = dot(colors[i], principal);
+    }
+
+    // stable sort (insertion sort, ascending by projection; order[] tracks the permutation)
+    for (uint i = 0; i < m_count; ++i)
+    {
+        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
+        {
+            swap(dps[j], dps[j - 1]);
+            swap(order[j], order[j - 1]);
+        }
+    }
+
+    // weight all the points
+#if ICBC_USE_SIMD
+    m_xxsum = SimdVector(0.0f);
+    m_xsum = SimdVector(0.0f);
+#else
+    m_xxsum = { 0.0f };
+    m_xsum = { 0.0f };
+    m_wsum = 0.0f;
+#endif
+
+    for (uint i = 0; i < m_count; ++i)
+    {
+        int p = order[i];
+#if ICBC_USE_SIMD
+        ICBC_ALIGN_16 Vector4 tmp;
+        tmp.xyz = colors[p];
+        tmp.w = 1;     // After the multiply below, the w lane carries the weight itself.
+        m_weighted[i] = SimdVector(&tmp.x) * SimdVector(weights[p]);
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+#else
+        m_weighted[i] = colors[p] * weights[p];
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+        m_weights[i] = weights[p];
+        m_wsum += m_weights[i];
+#endif
+    }
+}
+
+void ClusterFit::setColorSet(const Vector4 * colors, const Vector3 & metric)
+{   // Unweighted variant: always reads 16 colors (xyz only) and gives each weight 1.
+    setErrorMetric(metric);
+
+    // initialise the best error
+#if ICBC_USE_SIMD
+    m_besterror = SimdVector(FLT_MAX);
+#else
+    m_besterror = FLT_MAX;
+#endif
+
+    m_count = 16;
+
+    static const float weights[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1};
+    Vector3 vc[16];     // xyz copies, because the principal-axis helper takes a Vector3 array.
+    for (int i = 0; i < 16; i++) vc[i] = colors[i].xyz;
+
+    // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results.
+    Vector3 principal = computePrincipalComponent_PowerMethod(16, vc, weights);
+
+    // build the list of values
+    int order[16];
+    float dps[16];
+    for (uint i = 0; i < m_count; ++i)
+    {
+        order[i] = i;
+        dps[i] = dot(colors[i].xyz, principal);
+    }
+
+    // stable sort (insertion sort, ascending by projection; order[] tracks the permutation)
+    for (uint i = 0; i < m_count; ++i)
+    {
+        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
+        {
+            swap(dps[j], dps[j - 1]);
+            swap(order[j], order[j - 1]);
+        }
+    }
+
+    // weight all the points
+#if ICBC_USE_SIMD
+    m_xxsum = SimdVector(0.0f);
+    m_xsum = SimdVector(0.0f);
+#else
+    m_xxsum = { 0.0f };
+    m_xsum = { 0.0f };
+    m_wsum = 0.0f;
+#endif
+
+    for (uint i = 0; i < 16; ++i)
+    {
+        int p = order[i];
+#if ICBC_USE_SIMD
+        ICBC_ALIGN_16 Vector4 tmp;
+        tmp.xyz = colors[p].xyz;
+        tmp.w = 1;     // Unit weight in the w lane; no multiply is needed in this variant.
+        m_weighted[i] = SimdVector(&tmp.x);
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+#else
+        m_weighted[i] = colors[p].xyz;
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+        m_weights[i] = 1.0f;
+        m_wsum += m_weights[i];
+#endif
+    }
+}
+
+float ClusterFit::bestError() const
+{   // Returns m_besterror plus dot(m_xxsum, m_metricSqr) over the x, y, z channels.
+#if ICBC_USE_SIMD
+    SimdVector x = m_xxsum * m_metricSqr;
+    SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();     // The w lane is left out of the sum.
+    return error.toFloat();
+#else
+    return m_besterror + dot(m_xxsum, m_metricSqr);
+#endif
+}
+
+struct Precomp {    // One row of the precomputed tables that follow; four floats per row.
+    float alpha2_sum;       // presumably the sum of squared alpha weights for one cluster split -- TODO confirm against the table generator
+    float beta2_sum;
+    float alphabeta_sum;
+    float factor;           // FLT_MAX in some table rows (e.g. the first s_threeElement entry)
+};
+
+static const ICBC_ALIGN_16 Precomp s_threeElement[153] = {
+    { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 16)
+    { 0.250000f, 15.250000f, 0.250000f, 0.266667f }, // 1 (0 1 15)
+    { 0.500000f, 14.500000f, 0.500000f, 0.142857f }, // 2 (0 2 14)
+    { 0.750000f, 13.750000f, 0.750000f, 0.102564f }, // 3 (0 3 13)
+    { 1.000000f, 13.000000f, 1.000000f, 0.083333f }, // 4 (0 4 12)
+    { 1.250000f, 12.250000f, 1.250000f, 0.072727f }, // 5 (0 5 11)
+    { 1.500000f, 11.500000f, 1.500000f, 0.066667f }, // 6 (0 6 10)
+    { 1.750000f, 10.750000f, 1.750000f, 0.063492f }, // 7 (0 7 9)
+    { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 8 (0 8 8)
+    { 2.250000f, 9.250000f, 2.250000f, 0.063492f }, // 9 (0 9 7)
+    { 2.500000f, 8.500000f, 2.500000f, 0.066667f }, // 10 (0 10 6)
+    { 2.750000f, 7.750000f, 2.750000f, 0.072727f }, // 11 (0 11 5)
+    { 3.000000f, 7.000000f, 3.000000f, 0.083333f }, // 12 (0 12 4)
+    { 3.250000f, 6.250000f, 3.250000f, 0.102564f }, // 13 (0 13 3)
+    { 3.500000f, 5.500000f, 3.500000f, 0.142857f }, // 14 (0 14 2)
+    { 3.750000f, 4.750000f, 3.750000f, 0.266667f }, // 15 (0 15 1)
+    { 4.000000f, 4.000000f, 4.000000f, FLT_MAX }, // 16 (0 16 0)
+    { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 17 (1 0 15)
+    { 1.250000f, 14.250000f, 0.250000f, 0.056338f }, // 18 (1 1 14)
+    { 1.500000f, 13.500000f, 0.500000f, 0.050000f }, // 19 (1 2 13)
+    { 1.750000f, 12.750000f, 0.750000f, 0.045977f }, // 20 (1 3 12)
+    { 2.000000f, 12.000000f, 1.000000f, 0.043478f }, // 21 (1 4 11)
+    { 2.250000f, 11.250000f, 1.250000f, 0.042105f }, // 22 (1 5 10)
+    { 2.500000f, 10.500000f, 1.500000f, 0.041667f }, // 23 (1 6 9)
+    { 2.750000f, 9.750000f, 1.750000f, 0.042105f }, // 24 (1 7 8)
+    { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 25 (1 8 7)
+    { 3.250000f, 8.250000f, 2.250000f, 0.045977f }, // 26 (1 9 6)
+    { 3.500000f, 7.500000f, 2.500000f, 0.050000f }, // 27 (1 10 5)
+    { 3.750000f, 6.750000f, 2.750000f, 0.056338f }, // 28 (1 11 4)
+    { 4.000000f, 6.000000f, 3.000000f, 0.066667f }, // 29 (1 12 3)
+    { 4.250000f, 5.250000f, 3.250000f, 0.085106f }, // 30 (1 13 2)
+    { 4.500000f, 4.500000f, 3.500000f, 0.125000f }, // 31 (1 14 1)
+    { 4.750000f, 3.750000f, 3.750000f, 0.266667f }, // 32 (1 15 0)
+    { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 33 (2 0 14)
+    { 2.250000f, 13.250000f, 0.250000f, 0.033613f }, // 34 (2 1 13)
+    { 2.500000f, 12.500000f, 0.500000f, 0.032258f }, // 35 (2 2 12)
+    { 2.750000f, 11.750000f, 0.750000f, 0.031496f }, // 36 (2 3 11)
+    { 3.000000f, 11.000000f, 1.000000f, 0.031250f }, // 37 (2 4 10)
+    { 3.250000f, 10.250000f, 1.250000f, 0.031496f }, // 38 (2 5 9)
+    { 3.500000f, 9.500000f, 1.500000f, 0.032258f }, // 39 (2 6 8)
+    { 3.750000f, 8.750000f, 1.750000f, 0.033613f }, // 40 (2 7 7)
+    { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 41 (2 8 6)
+    { 4.250000f, 7.250000f, 2.250000f, 0.038835f }, // 42 (2 9 5)
+    { 4.500000f, 6.500000f, 2.500000f, 0.043478f }, // 43 (2 10 4)
+    { 4.750000f, 5.750000f, 2.750000f, 0.050633f }, // 44 (2 11 3)
+    { 5.000000f, 5.000000f, 3.000000f, 0.062500f }, // 45 (2 12 2)
+    { 5.250000f, 4.250000f, 3.250000f, 0.085106f }, // 46 (2 13 1)
+    { 5.500000f, 3.500000f, 3.500000f, 0.142857f }, // 47 (2 14 0)
+    { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 48 (3 0 13)
+    { 3.250000f, 12.250000f, 0.250000f, 0.025157f }, // 49 (3 1 12)
+    { 3.500000f, 11.500000f, 0.500000f, 0.025000f }, // 50 (3 2 11)
+    { 3.750000f, 10.750000f, 0.750000f, 0.025157f }, // 51 (3 3 10)
+    { 4.000000f, 10.000000f, 1.000000f, 0.025641f }, // 52 (3 4 9)
+    { 4.250000f, 9.250000f, 1.250000f, 0.026490f }, // 53 (3 5 8)
+    { 4.500000f, 8.500000f, 1.500000f, 0.027778f }, // 54 (3 6 7)
+    { 4.750000f, 7.750000f, 1.750000f, 0.029630f }, // 55 (3 7 6)
+    { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 56 (3 8 5)
+    { 5.250000f, 6.250000f, 2.250000f, 0.036036f }, // 57 (3 9 4)
+    { 5.500000f, 5.500000f, 2.500000f, 0.041667f }, // 58 (3 10 3)
+    { 5.750000f, 4.750000f, 2.750000f, 0.050633f }, // 59 (3 11 2)
+    { 6.000000f, 4.000000f, 3.000000f, 0.066667f }, // 60 (3 12 1)
+    { 6.250000f, 3.250000f, 3.250000f, 0.102564f }, // 61 (3 13 0)
+    { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 62 (4 0 12)
+    { 4.250000f, 11.250000f, 0.250000f, 0.020942f }, // 63 (4 1 11)
+    { 4.500000f, 10.500000f, 0.500000f, 0.021277f }, // 64 (4 2 10)
+    { 4.750000f, 9.750000f, 0.750000f, 0.021858f }, // 65 (4 3 9)
+    { 5.000000f, 9.000000f, 1.000000f, 0.022727f }, // 66 (4 4 8)
+    { 5.250000f, 8.250000f, 1.250000f, 0.023952f }, // 67 (4 5 7)
+    { 5.500000f, 7.500000f, 1.500000f, 0.025641f }, // 68 (4 6 6)
+    { 5.750000f, 6.750000f, 1.750000f, 0.027972f }, // 69 (4 7 5)
+    { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 70 (4 8 4)
+    { 6.250000f, 5.250000f, 2.250000f, 0.036036f }, // 71 (4 9 3)
+    { 6.500000f, 4.500000f, 2.500000f, 0.043478f }, // 72 (4 10 2)
+    { 6.750000f, 3.750000f, 2.750000f, 0.056338f }, // 73 (4 11 1)
+    { 7.000000f, 3.000000f, 3.000000f, 0.083333f }, // 74 (4 12 0)
+    { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 75 (5 0 11)
+    { 5.250000f, 10.250000f, 0.250000f, 0.018605f }, // 76 (5 1 10)
+    { 5.500000f, 9.500000f, 0.500000f, 0.019231f }, // 77 (5 2 9)
+    { 5.750000f, 8.750000f, 0.750000f, 0.020101f }, // 78 (5 3 8)
+    { 6.000000f, 8.000000f, 1.000000f, 0.021277f }, // 79 (5 4 7)
+    { 6.250000f, 7.250000f, 1.250000f, 0.022857f }, // 80 (5 5 6)
+    { 6.500000f, 6.500000f, 1.500000f, 0.025000f }, // 81 (5 6 5)
+    { 6.750000f, 5.750000f, 1.750000f, 0.027972f }, // 82 (5 7 4)
+    { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 83 (5 8 3)
+    { 7.250000f, 4.250000f, 2.250000f, 0.038835f }, // 84 (5 9 2)
+    { 7.500000f, 3.500000f, 2.500000f, 0.050000f }, // 85 (5 10 1)
+    { 7.750000f, 2.750000f, 2.750000f, 0.072727f }, // 86 (5 11 0)
+    { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 87 (6 0 10)
+    { 6.250000f, 9.250000f, 0.250000f, 0.017316f }, // 88 (6 1 9)
+    { 6.500000f, 8.500000f, 0.500000f, 0.018182f }, // 89 (6 2 8)
+    { 6.750000f, 7.750000f, 0.750000f, 0.019324f }, // 90 (6 3 7)
+    { 7.000000f, 7.000000f, 1.000000f, 0.020833f }, // 91 (6 4 6)
+    { 7.250000f, 6.250000f, 1.250000f, 0.022857f }, // 92 (6 5 5)
+    { 7.500000f, 5.500000f, 1.500000f, 0.025641f }, // 93 (6 6 4)
+    { 7.750000f, 4.750000f, 1.750000f, 0.029630f }, // 94 (6 7 3)
+    { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 95 (6 8 2)
+    { 8.250000f, 3.250000f, 2.250000f, 0.045977f }, // 96 (6 9 1)
+    { 8.500000f, 2.500000f, 2.500000f, 0.066667f }, // 97 (6 10 0)
+    { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 98 (7 0 9)
+    { 7.250000f, 8.250000f, 0.250000f, 0.016736f }, // 99 (7 1 8)
+    { 7.500000f, 7.500000f, 0.500000f, 0.017857f }, // 100 (7 2 7)
+    { 7.750000f, 6.750000f, 0.750000f, 0.019324f }, // 101 (7 3 6)
+    { 8.000000f, 6.000000f, 1.000000f, 0.021277f }, // 102 (7 4 5)
+    { 8.250000f, 5.250000f, 1.250000f, 0.023952f }, // 103 (7 5 4)
+    { 8.500000f, 4.500000f, 1.500000f, 0.027778f }, // 104 (7 6 3)
+    { 8.750000f, 3.750000f, 1.750000f, 0.033613f }, // 105 (7 7 2)
+    { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 106 (7 8 1)
+    { 9.250000f, 2.250000f, 2.250000f, 0.063492f }, // 107 (7 9 0)
+    { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 108 (8 0 8)
+    { 8.250000f, 7.250000f, 0.250000f, 0.016736f }, // 109 (8 1 7)
+    { 8.500000f, 6.500000f, 0.500000f, 0.018182f }, // 110 (8 2 6)
+    { 8.750000f, 5.750000f, 0.750000f, 0.020101f }, // 111 (8 3 5)
+    { 9.000000f, 5.000000f, 1.000000f, 0.022727f }, // 112 (8 4 4)
+    { 9.250000f, 4.250000f, 1.250000f, 0.026490f }, // 113 (8 5 3)
+    { 9.500000f, 3.500000f, 1.500000f, 0.032258f }, // 114 (8 6 2)
+    { 9.750000f, 2.750000f, 1.750000f, 0.042105f }, // 115 (8 7 1)
+    { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 116 (8 8 0)
+    { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 117 (9 0 7)
+    { 9.250000f, 6.250000f, 0.250000f, 0.017316f }, // 118 (9 1 6)
+    { 9.500000f, 5.500000f, 0.500000f, 0.019231f }, // 119 (9 2 5)
+    { 9.750000f, 4.750000f, 0.750000f, 0.021858f }, // 120 (9 3 4)
+    { 10.000000f, 4.000000f, 1.000000f, 0.025641f }, // 121 (9 4 3)
+    { 10.250000f, 3.250000f, 1.250000f, 0.031496f }, // 122 (9 5 2)
+    { 10.500000f, 2.500000f, 1.500000f, 0.041667f }, // 123 (9 6 1)
+    { 10.750000f, 1.750000f, 1.750000f, 0.063492f }, // 124 (9 7 0)
+    { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 125 (10 0 6)
+    { 10.250000f, 5.250000f, 0.250000f, 0.018605f }, // 126 (10 1 5)
+    { 10.500000f, 4.500000f, 0.500000f, 0.021277f }, // 127 (10 2 4)
+    { 10.750000f, 3.750000f, 0.750000f, 0.025157f }, // 128 (10 3 3)
+    { 11.000000f, 3.000000f, 1.000000f, 0.031250f }, // 129 (10 4 2)
+    { 11.250000f, 2.250000f, 1.250000f, 0.042105f }, // 130 (10 5 1)
+    { 11.500000f, 1.500000f, 1.500000f, 0.066667f }, // 131 (10 6 0)
+    { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 132 (11 0 5)
+    { 11.250000f, 4.250000f, 0.250000f, 0.020942f }, // 133 (11 1 4)
+    { 11.500000f, 3.500000f, 0.500000f, 0.025000f }, // 134 (11 2 3)
+    { 11.750000f, 2.750000f, 0.750000f, 0.031496f }, // 135 (11 3 2)
+    { 12.000000f, 2.000000f, 1.000000f, 0.043478f }, // 136 (11 4 1)
+    { 12.250000f, 1.250000f, 1.250000f, 0.072727f }, // 137 (11 5 0)
+    { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 138 (12 0 4)
+    { 12.250000f, 3.250000f, 0.250000f, 0.025157f }, // 139 (12 1 3)
+    { 12.500000f, 2.500000f, 0.500000f, 0.032258f }, // 140 (12 2 2)
+    { 12.750000f, 1.750000f, 0.750000f, 0.045977f }, // 141 (12 3 1)
+    { 13.000000f, 1.000000f, 1.000000f, 0.083333f }, // 142 (12 4 0)
+    { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 143 (13 0 3)
+    { 13.250000f, 2.250000f, 0.250000f, 0.033613f }, // 144 (13 1 2)
+    { 13.500000f, 1.500000f, 0.500000f, 0.050000f }, // 145 (13 2 1)
+    { 13.750000f, 0.750000f, 0.750000f, 0.102564f }, // 146 (13 3 0)
+    { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 147 (14 0 2)
+    { 14.250000f, 1.250000f, 0.250000f, 0.056338f }, // 148 (14 1 1)
+    { 14.500000f, 0.500000f, 0.500000f, 0.142857f }, // 149 (14 2 0)
+    { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 150 (15 0 1)
+    { 15.250000f, 0.250000f, 0.250000f, 0.266667f }, // 151 (15 1 0)
+    { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 152 (16 0 0)
+}; // 153 three cluster elements
+
+static const ICBC_ALIGN_16 Precomp s_fourElement[969] = {
+    { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 0 16)
+    { 0.111111f, 15.444445f, 0.222222f, 0.600000f }, // 1 (0 0 1 15)
+    { 0.222222f, 14.888889f, 0.444444f, 0.321429f }, // 2 (0 0 2 14)
+    { 0.333333f, 14.333333f, 0.666667f, 0.230769f }, // 3 (0 0 3 13)
+    { 0.444444f, 13.777778f, 0.888889f, 0.187500f }, // 4 (0 0 4 12)
+    { 0.555556f, 13.222222f, 1.111111f, 0.163636f }, // 5 (0 0 5 11)
+    { 0.666667f, 12.666667f, 1.333333f, 0.150000f }, // 6 (0 0 6 10)
+    { 0.777778f, 12.111111f, 1.555556f, 0.142857f }, // 7 (0 0 7 9)
+    { 0.888889f, 11.555555f, 1.777778f, 0.140625f }, // 8 (0 0 8 8)
+    { 1.000000f, 11.000000f, 2.000000f, 0.142857f }, // 9 (0 0 9 7)
+    { 1.111111f, 10.444445f, 2.222222f, 0.150000f }, // 10 (0 0 10 6)
+    { 1.222222f, 9.888889f, 2.444444f, 0.163636f }, // 11 (0 0 11 5)
+    { 1.333333f, 9.333333f, 2.666667f, 0.187500f }, // 12 (0 0 12 4)
+    { 1.444444f, 8.777778f, 2.888889f, 0.230769f }, // 13 (0 0 13 3)
+    { 1.555556f, 8.222222f, 3.111111f, 0.321429f }, // 14 (0 0 14 2)
+    { 1.666667f, 7.666667f, 3.333333f, 0.600000f }, // 15 (0 0 15 1)
+    { 1.777778f, 7.111111f, 3.555556f, FLT_MAX }, // 16 (0 0 16 0)
+    { 0.444444f, 15.111111f, 0.222222f, 0.150000f }, // 17 (0 1 0 15)
+    { 0.555556f, 14.555555f, 0.444444f, 0.126761f }, // 18 (0 1 1 14)
+    { 0.666667f, 14.000000f, 0.666667f, 0.112500f }, // 19 (0 1 2 13)
+    { 0.777778f, 13.444445f, 0.888889f, 0.103448f }, // 20 (0 1 3 12)
+    { 0.888889f, 12.888889f, 1.111111f, 0.097826f }, // 21 (0 1 4 11)
+    { 1.000000f, 12.333333f, 1.333333f, 0.094737f }, // 22 (0 1 5 10)
+    { 1.111111f, 11.777778f, 1.555556f, 0.093750f }, // 23 (0 1 6 9)
+    { 1.222222f, 11.222222f, 1.777778f, 0.094737f }, // 24 (0 1 7 8)
+    { 1.333333f, 10.666667f, 2.000000f, 0.097826f }, // 25 (0 1 8 7)
+    { 1.444444f, 10.111111f, 2.222222f, 0.103448f }, // 26 (0 1 9 6)
+    { 1.555556f, 9.555555f, 2.444444f, 0.112500f }, // 27 (0 1 10 5)
+    { 1.666667f, 9.000000f, 2.666667f, 0.126761f }, // 28 (0 1 11 4)
+    { 1.777778f, 8.444445f, 2.888889f, 0.150000f }, // 29 (0 1 12 3)
+    { 1.888889f, 7.888889f, 3.111111f, 0.191489f }, // 30 (0 1 13 2)
+    { 2.000000f, 7.333333f, 3.333333f, 0.281250f }, // 31 (0 1 14 1)
+    { 2.111111f, 6.777778f, 3.555556f, 0.600000f }, // 32 (0 1 15 0)
+    { 0.888889f, 14.222222f, 0.444444f, 0.080357f }, // 33 (0 2 0 14)
+    { 1.000000f, 13.666667f, 0.666667f, 0.075630f }, // 34 (0 2 1 13)
+    { 1.111111f, 13.111111f, 0.888889f, 0.072581f }, // 35 (0 2 2 12)
+    { 1.222222f, 12.555555f, 1.111111f, 0.070866f }, // 36 (0 2 3 11)
+    { 1.333333f, 12.000000f, 1.333333f, 0.070313f }, // 37 (0 2 4 10)
+    { 1.444444f, 11.444445f, 1.555556f, 0.070866f }, // 38 (0 2 5 9)
+    { 1.555556f, 10.888889f, 1.777778f, 0.072581f }, // 39 (0 2 6 8)
+    { 1.666667f, 10.333333f, 2.000000f, 0.075630f }, // 40 (0 2 7 7)
+    { 1.777778f, 9.777778f, 2.222222f, 0.080357f }, // 41 (0 2 8 6)
+    { 1.888889f, 9.222222f, 2.444444f, 0.087379f }, // 42 (0 2 9 5)
+    { 2.000000f, 8.666667f, 2.666667f, 0.097826f }, // 43 (0 2 10 4)
+    { 2.111111f, 8.111111f, 2.888889f, 0.113924f }, // 44 (0 2 11 3)
+    { 2.222222f, 7.555556f, 3.111111f, 0.140625f }, // 45 (0 2 12 2)
+    { 2.333333f, 7.000000f, 3.333333f, 0.191489f }, // 46 (0 2 13 1)
+    { 2.444444f, 6.444445f, 3.555556f, 0.321429f }, // 47 (0 2 14 0)
+    { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 48 (0 3 0 13)
+    { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 49 (0 3 1 12)
+    { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 50 (0 3 2 11)
+    { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 51 (0 3 3 10)
+    { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 52 (0 3 4 9)
+    { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 53 (0 3 5 8)
+    { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 54 (0 3 6 7)
+    { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 55 (0 3 7 6)
+    { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 56 (0 3 8 5)
+    { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 57 (0 3 9 4)
+    { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 58 (0 3 10 3)
+    { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 59 (0 3 11 2)
+    { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 60 (0 3 12 1)
+    { 2.777778f, 6.111111f, 3.555556f, 0.230769f }, // 61 (0 3 13 0)
+    { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 62 (0 4 0 12)
+    { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 63 (0 4 1 11)
+    { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 64 (0 4 2 10)
+    { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 65 (0 4 3 9)
+    { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 66 (0 4 4 8)
+    { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 67 (0 4 5 7)
+    { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 68 (0 4 6 6)
+    { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 69 (0 4 7 5)
+    { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 70 (0 4 8 4)
+    { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 71 (0 4 9 3)
+    { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 72 (0 4 10 2)
+    { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 73 (0 4 11 1)
+    { 3.111111f, 5.777778f, 3.555556f, 0.187500f }, // 74 (0 4 12 0)
+    { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 75 (0 5 0 11)
+    { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 76 (0 5 1 10)
+    { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 77 (0 5 2 9)
+    { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 78 (0 5 3 8)
+    { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 79 (0 5 4 7)
+    { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 80 (0 5 5 6)
+    { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 81 (0 5 6 5)
+    { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 82 (0 5 7 4)
+    { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 83 (0 5 8 3)
+    { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 84 (0 5 9 2)
+    { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 85 (0 5 10 1)
+    { 3.444444f, 5.444445f, 3.555556f, 0.163636f }, // 86 (0 5 11 0)
+    { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 87 (0 6 0 10)
+    { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 88 (0 6 1 9)
+    { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 89 (0 6 2 8)
+    { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 90 (0 6 3 7)
+    { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 91 (0 6 4 6)
+    { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 92 (0 6 5 5)
+    { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 93 (0 6 6 4)
+    { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 94 (0 6 7 3)
+    { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 95 (0 6 8 2)
+    { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 96 (0 6 9 1)
+    { 3.777778f, 5.111111f, 3.555556f, 0.150000f }, // 97 (0 6 10 0)
+    { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 98 (0 7 0 9)
+    { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 99 (0 7 1 8)
+    { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 100 (0 7 2 7)
+    { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 101 (0 7 3 6)
+    { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 102 (0 7 4 5)
+    { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 103 (0 7 5 4)
+    { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 104 (0 7 6 3)
+    { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 105 (0 7 7 2)
+    { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 106 (0 7 8 1)
+    { 4.111111f, 4.777778f, 3.555556f, 0.142857f }, // 107 (0 7 9 0)
+    { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 108 (0 8 0 8)
+    { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 109 (0 8 1 7)
+    { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 110 (0 8 2 6)
+    { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 111 (0 8 3 5)
+    { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 112 (0 8 4 4)
+    { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 113 (0 8 5 3)
+    { 4.222222f, 5.555555f, 3.111111f, 0.072581f }, // 114 (0 8 6 2)
+    { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 115 (0 8 7 1)
+    { 4.444445f, 4.444445f, 3.555556f, 0.140625f }, // 116 (0 8 8 0)
+    { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 117 (0 9 0 7)
+    { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 118 (0 9 1 6)
+    { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 119 (0 9 2 5)
+    { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 120 (0 9 3 4)
+    { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 121 (0 9 4 3)
+    { 4.555556f, 5.222222f, 3.111111f, 0.070866f }, // 122 (0 9 5 2)
+    { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 123 (0 9 6 1)
+    { 4.777778f, 4.111111f, 3.555556f, 0.142857f }, // 124 (0 9 7 0)
+    { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 125 (0 10 0 6)
+    { 4.555556f, 6.555555f, 2.444444f, 0.041860f }, // 126 (0 10 1 5)
+    { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 127 (0 10 2 4)
+    { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 128 (0 10 3 3)
+    { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 129 (0 10 4 2)
+    { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 130 (0 10 5 1)
+    { 5.111111f, 3.777778f, 3.555556f, 0.150000f }, // 131 (0 10 6 0)
+    { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 132 (0 11 0 5)
+    { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 133 (0 11 1 4)
+    { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 134 (0 11 2 3)
+    { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 135 (0 11 3 2)
+    { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 136 (0 11 4 1)
+    { 5.444445f, 3.444444f, 3.555556f, 0.163636f }, // 137 (0 11 5 0)
+    { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 138 (0 12 0 4)
+    { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 139 (0 12 1 3)
+    { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 140 (0 12 2 2)
+    { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 141 (0 12 3 1)
+    { 5.777778f, 3.111111f, 3.555556f, 0.187500f }, // 142 (0 12 4 0)
+    { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 143 (0 13 0 3)
+    { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 144 (0 13 1 2)
+    { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 145 (0 13 2 1)
+    { 6.111111f, 2.777778f, 3.555556f, 0.230769f }, // 146 (0 13 3 0)
+    { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 147 (0 14 0 2)
+    { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 148 (0 14 1 1)
+    { 6.444445f, 2.444444f, 3.555556f, 0.321429f }, // 149 (0 14 2 0)
+    { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 150 (0 15 0 1)
+    { 6.777778f, 2.111111f, 3.555556f, 0.600000f }, // 151 (0 15 1 0)
+    { 7.111111f, 1.777778f, 3.555556f, FLT_MAX }, // 152 (0 16 0 0)
+    { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 153 (1 0 0 15)
+    { 1.111111f, 14.444445f, 0.222222f, 0.062500f }, // 154 (1 0 1 14)
+    { 1.222222f, 13.888889f, 0.444444f, 0.059603f }, // 155 (1 0 2 13)
+    { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 156 (1 0 3 12)
+    { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 157 (1 0 4 11)
+    { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 158 (1 0 5 10)
+    { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 159 (1 0 6 9)
+    { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 160 (1 0 7 8)
+    { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 161 (1 0 8 7)
+    { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 162 (1 0 9 6)
+    { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 163 (1 0 10 5)
+    { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 164 (1 0 11 4)
+    { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 165 (1 0 12 3)
+    { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 166 (1 0 13 2)
+    { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 167 (1 0 14 1)
+    { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 168 (1 0 15 0)
+    { 1.444444f, 14.111111f, 0.222222f, 0.049180f }, // 169 (1 1 0 14)
+    { 1.555556f, 13.555555f, 0.444444f, 0.047872f }, // 170 (1 1 1 13)
+    { 1.666667f, 13.000000f, 0.666667f, 0.047120f }, // 171 (1 1 2 12)
+    { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 172 (1 1 3 11)
+    { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 173 (1 1 4 10)
+    { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 174 (1 1 5 9)
+    { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 175 (1 1 6 8)
+    { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 176 (1 1 7 7)
+    { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 177 (1 1 8 6)
+    { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 178 (1 1 9 5)
+    { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 179 (1 1 10 4)
+    { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 180 (1 1 11 3)
+    { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 181 (1 1 12 2)
+    { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 182 (1 1 13 1)
+    { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 183 (1 1 14 0)
+    { 1.888889f, 13.222222f, 0.444444f, 0.040359f }, // 184 (1 2 0 13)
+    { 2.000000f, 12.666667f, 0.666667f, 0.040179f }, // 185 (1 2 1 12)
+    { 2.111111f, 12.111111f, 0.888889f, 0.040359f }, // 186 (1 2 2 11)
+    { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 187 (1 2 3 10)
+    { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 188 (1 2 4 9)
+    { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 189 (1 2 5 8)
+    { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 190 (1 2 6 7)
+    { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 191 (1 2 7 6)
+    { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 192 (1 2 8 5)
+    { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 193 (1 2 9 4)
+    { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 194 (1 2 10 3)
+    { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 195 (1 2 11 2)
+    { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 196 (1 2 12 1)
+    { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 197 (1 2 13 0)
+    { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 198 (1 3 0 12)
+    { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 199 (1 3 1 11)
+    { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 200 (1 3 2 10)
+    { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 201 (1 3 3 9)
+    { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 202 (1 3 4 8)
+    { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 203 (1 3 5 7)
+    { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 204 (1 3 6 6)
+    { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 205 (1 3 7 5)
+    { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 206 (1 3 8 4)
+    { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 207 (1 3 9 3)
+    { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 208 (1 3 10 2)
+    { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 209 (1 3 11 1)
+    { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 210 (1 3 12 0)
+    { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 211 (1 4 0 11)
+    { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 212 (1 4 1 10)
+    { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 213 (1 4 2 9)
+    { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 214 (1 4 3 8)
+    { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 215 (1 4 4 7)
+    { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 216 (1 4 5 6)
+    { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 217 (1 4 6 5)
+    { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 218 (1 4 7 4)
+    { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 219 (1 4 8 3)
+    { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 220 (1 4 9 2)
+    { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 221 (1 4 10 1)
+    { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 222 (1 4 11 0)
+    { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 223 (1 5 0 10)
+    { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 224 (1 5 1 9)
+    { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 225 (1 5 2 8)
+    { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 226 (1 5 3 7)
+    { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 227 (1 5 4 6)
+    { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 228 (1 5 5 5)
+    { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 229 (1 5 6 4)
+    { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 230 (1 5 7 3)
+    { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 231 (1 5 8 2)
+    { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 232 (1 5 9 1)
+    { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 233 (1 5 10 0)
+    { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 234 (1 6 0 9)
+    { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 235 (1 6 1 8)
+    { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 236 (1 6 2 7)
+    { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 237 (1 6 3 6)
+    { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 238 (1 6 4 5)
+    { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 239 (1 6 5 4)
+    { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 240 (1 6 6 3)
+    { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 241 (1 6 7 2)
+    { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 242 (1 6 8 1)
+    { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 243 (1 6 9 0)
+    { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 244 (1 7 0 8)
+    { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 245 (1 7 1 7)
+    { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 246 (1 7 2 6)
+    { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 247 (1 7 3 5)
+    { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 248 (1 7 4 4)
+    { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 249 (1 7 5 3)
+    { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 250 (1 7 6 2)
+    { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 251 (1 7 7 1)
+    { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 252 (1 7 8 0)
+    { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 253 (1 8 0 7)
+    { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 254 (1 8 1 6)
+    { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 255 (1 8 2 5)
+    { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 256 (1 8 3 4)
+    { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 257 (1 8 4 3)
+    { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 258 (1 8 5 2)
+    { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 259 (1 8 6 1)
+    { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 260 (1 8 7 0)
+    { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 261 (1 9 0 6)
+    { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 262 (1 9 1 5)
+    { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 263 (1 9 2 4)
+    { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 264 (1 9 3 3)
+    { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 265 (1 9 4 2)
+    { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 266 (1 9 5 1)
+    { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 267 (1 9 6 0)
+    { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 268 (1 10 0 5)
+    { 5.555556f, 5.555555f, 2.444444f, 0.040179f }, // 269 (1 10 1 4)
+    { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 270 (1 10 2 3)
+    { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 271 (1 10 3 2)
+    { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 272 (1 10 4 1)
+    { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 273 (1 10 5 0)
+    { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 274 (1 11 0 4)
+    { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 275 (1 11 1 3)
+    { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 276 (1 11 2 2)
+    { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 277 (1 11 3 1)
+    { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 278 (1 11 4 0)
+    { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 279 (1 12 0 3)
+    { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 280 (1 12 1 2)
+    { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 281 (1 12 2 1)
+    { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 282 (1 12 3 0)
+    { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 283 (1 13 0 2)
+    { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 284 (1 13 1 1)
+    { 7.000000f, 2.333333f, 3.333333f, 0.191489f }, // 285 (1 13 2 0)
+    { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 286 (1 14 0 1)
+    { 7.333333f, 2.000000f, 3.333333f, 0.281250f }, // 287 (1 14 1 0)
+    { 7.666667f, 1.666667f, 3.333333f, 0.600000f }, // 288 (1 15 0 0)
+    { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 289 (2 0 0 14)
+    { 2.111111f, 13.444445f, 0.222222f, 0.035294f }, // 290 (2 0 1 13)
+    { 2.222222f, 12.888889f, 0.444444f, 0.035156f }, // 291 (2 0 2 12)
+    { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 292 (2 0 3 11)
+    { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 293 (2 0 4 10)
+    { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 294 (2 0 5 9)
+    { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 295 (2 0 6 8)
+    { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 296 (2 0 7 7)
+    { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 297 (2 0 8 6)
+    { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 298 (2 0 9 5)
+    { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 299 (2 0 10 4)
+    { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 300 (2 0 11 3)
+    { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 301 (2 0 12 2)
+    { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 302 (2 0 13 1)
+    { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 303 (2 0 14 0)
+    { 2.444444f, 13.111111f, 0.222222f, 0.031250f }, // 304 (2 1 0 13)
+    { 2.555556f, 12.555555f, 0.444444f, 0.031359f }, // 305 (2 1 1 12)
+    { 2.666667f, 12.000000f, 0.666667f, 0.031690f }, // 306 (2 1 2 11)
+    { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 307 (2 1 3 10)
+    { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 308 (2 1 4 9)
+    { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 309 (2 1 5 8)
+    { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 310 (2 1 6 7)
+    { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 311 (2 1 7 6)
+    { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 312 (2 1 8 5)
+    { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 313 (2 1 9 4)
+    { 3.555556f, 7.555556f, 2.444444f, 0.047872f }, // 314 (2 1 10 3)
+    { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 315 (2 1 11 2)
+    { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 316 (2 1 12 1)
+    { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 317 (2 1 13 0)
+    { 2.888889f, 12.222222f, 0.444444f, 0.028481f }, // 318 (2 2 0 12)
+    { 3.000000f, 11.666667f, 0.666667f, 0.028939f }, // 319 (2 2 1 11)
+    { 3.111111f, 11.111111f, 0.888889f, 0.029605f }, // 320 (2 2 2 10)
+    { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 321 (2 2 3 9)
+    { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 322 (2 2 4 8)
+    { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 323 (2 2 5 7)
+    { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 324 (2 2 6 6)
+    { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 325 (2 2 7 5)
+    { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 326 (2 2 8 4)
+    { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 327 (2 2 9 3)
+    { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 328 (2 2 10 2)
+    { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 329 (2 2 11 1)
+    { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 330 (2 2 12 0)
+    { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 331 (2 3 0 11)
+    { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 332 (2 3 1 10)
+    { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 333 (2 3 2 9)
+    { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 334 (2 3 3 8)
+    { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 335 (2 3 4 7)
+    { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 336 (2 3 5 6)
+    { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 337 (2 3 6 5)
+    { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 338 (2 3 7 4)
+    { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 339 (2 3 8 3)
+    { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 340 (2 3 9 2)
+    { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 341 (2 3 10 1)
+    { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 342 (2 3 11 0)
+    { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 343 (2 4 0 10)
+    { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 344 (2 4 1 9)
+    { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 345 (2 4 2 8)
+    { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 346 (2 4 3 7)
+    { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 347 (2 4 4 6)
+    { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 348 (2 4 5 5)
+    { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 349 (2 4 6 4)
+    { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 350 (2 4 7 3)
+    { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 351 (2 4 8 2)
+    { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 352 (2 4 9 1)
+    { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 353 (2 4 10 0)
+    { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 354 (2 5 0 9)
+    { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 355 (2 5 1 8)
+    { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 356 (2 5 2 7)
+    { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 357 (2 5 3 6)
+    { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 358 (2 5 4 5)
+    { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 359 (2 5 5 4)
+    { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 360 (2 5 6 3)
+    { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 361 (2 5 7 2)
+    { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 362 (2 5 8 1)
+    { 5.222222f, 4.555556f, 3.111111f, 0.070866f }, // 363 (2 5 9 0)
+    { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 364 (2 6 0 8)
+    { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 365 (2 6 1 7)
+    { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 366 (2 6 2 6)
+    { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 367 (2 6 3 5)
+    { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 368 (2 6 4 4)
+    { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 369 (2 6 5 3)
+    { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 370 (2 6 6 2)
+    { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 371 (2 6 7 1)
+    { 5.555555f, 4.222222f, 3.111111f, 0.072581f }, // 372 (2 6 8 0)
+    { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 373 (2 7 0 7)
+    { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 374 (2 7 1 6)
+    { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 375 (2 7 2 5)
+    { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 376 (2 7 3 4)
+    { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 377 (2 7 4 3)
+    { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 378 (2 7 5 2)
+    { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 379 (2 7 6 1)
+    { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 380 (2 7 7 0)
+    { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 381 (2 8 0 6)
+    { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 382 (2 8 1 5)
+    { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 383 (2 8 2 4)
+    { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 384 (2 8 3 3)
+    { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 385 (2 8 4 2)
+    { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 386 (2 8 5 1)
+    { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 387 (2 8 6 0)
+    { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 388 (2 9 0 5)
+    { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 389 (2 9 1 4)
+    { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 390 (2 9 2 3)
+    { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 391 (2 9 3 2)
+    { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 392 (2 9 4 1)
+    { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 393 (2 9 5 0)
+    { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 394 (2 10 0 4)
+    { 6.555556f, 4.555555f, 2.444444f, 0.041860f }, // 395 (2 10 1 3)
+    { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 396 (2 10 2 2)
+    { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 397 (2 10 3 1)
+    { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 398 (2 10 4 0)
+    { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 399 (2 11 0 3)
+    { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 400 (2 11 1 2)
+    { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 401 (2 11 2 1)
+    { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 402 (2 11 3 0)
+    { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 403 (2 12 0 2)
+    { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 404 (2 12 1 1)
+    { 7.555556f, 2.222222f, 3.111111f, 0.140625f }, // 405 (2 12 2 0)
+    { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 406 (2 13 0 1)
+    { 7.888889f, 1.888889f, 3.111111f, 0.191489f }, // 407 (2 13 1 0)
+    { 8.222222f, 1.555556f, 3.111111f, 0.321429f }, // 408 (2 14 0 0)
+    { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 409 (3 0 0 13)
+    { 3.111111f, 12.444445f, 0.222222f, 0.025862f }, // 410 (3 0 1 12)
+    { 3.222222f, 11.888889f, 0.444444f, 0.026239f }, // 411 (3 0 2 11)
+    { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 412 (3 0 3 10)
+    { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 413 (3 0 4 9)
+    { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 414 (3 0 5 8)
+    { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 415 (3 0 6 7)
+    { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 416 (3 0 7 6)
+    { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 417 (3 0 8 5)
+    { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 418 (3 0 9 4)
+    { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 419 (3 0 10 3)
+    { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 420 (3 0 11 2)
+    { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 421 (3 0 12 1)
+    { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 422 (3 0 13 0)
+    { 3.444444f, 12.111111f, 0.222222f, 0.024000f }, // 423 (3 1 0 12)
+    { 3.555556f, 11.555555f, 0.444444f, 0.024457f }, // 424 (3 1 1 11)
+    { 3.666667f, 11.000000f, 0.666667f, 0.025070f }, // 425 (3 1 2 10)
+    { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 426 (3 1 3 9)
+    { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 427 (3 1 4 8)
+    { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 428 (3 1 5 7)
+    { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 429 (3 1 6 6)
+    { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 430 (3 1 7 5)
+    { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 431 (3 1 8 4)
+    { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 432 (3 1 9 3)
+    { 4.555555f, 6.555556f, 2.444444f, 0.041860f }, // 433 (3 1 10 2)
+    { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 434 (3 1 11 1)
+    { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 435 (3 1 12 0)
+    { 3.888889f, 11.222222f, 0.444444f, 0.023018f }, // 436 (3 2 0 11)
+    { 4.000000f, 10.666667f, 0.666667f, 0.023684f }, // 437 (3 2 1 10)
+    { 4.111111f, 10.111111f, 0.888889f, 0.024523f }, // 438 (3 2 2 9)
+    { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 439 (3 2 3 8)
+    { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 440 (3 2 4 7)
+    { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 441 (3 2 5 6)
+    { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 442 (3 2 6 5)
+    { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 443 (3 2 7 4)
+    { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 444 (3 2 8 3)
+    { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 445 (3 2 9 2)
+    { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 446 (3 2 10 1)
+    { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 447 (3 2 11 0)
+    { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 448 (3 3 0 10)
+    { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 449 (3 3 1 9)
+    { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 450 (3 3 2 8)
+    { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 451 (3 3 3 7)
+    { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 452 (3 3 4 6)
+    { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 453 (3 3 5 5)
+    { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 454 (3 3 6 4)
+    { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 455 (3 3 7 3)
+    { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 456 (3 3 8 2)
+    { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 457 (3 3 9 1)
+    { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 458 (3 3 10 0)
+    { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 459 (3 4 0 9)
+    { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 460 (3 4 1 8)
+    { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 461 (3 4 2 7)
+    { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 462 (3 4 3 6)
+    { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 463 (3 4 4 5)
+    { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 464 (3 4 5 4)
+    { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 465 (3 4 6 3)
+    { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 466 (3 4 7 2)
+    { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 467 (3 4 8 1)
+    { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 468 (3 4 9 0)
+    { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 469 (3 5 0 8)
+    { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 470 (3 5 1 7)
+    { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 471 (3 5 2 6)
+    { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 472 (3 5 3 5)
+    { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 473 (3 5 4 4)
+    { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 474 (3 5 5 3)
+    { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 475 (3 5 6 2)
+    { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 476 (3 5 7 1)
+    { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 477 (3 5 8 0)
+    { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 478 (3 6 0 7)
+    { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 479 (3 6 1 6)
+    { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 480 (3 6 2 5)
+    { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 481 (3 6 3 4)
+    { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 482 (3 6 4 3)
+    { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 483 (3 6 5 2)
+    { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 484 (3 6 6 1)
+    { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 485 (3 6 7 0)
+    { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 486 (3 7 0 6)
+    { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 487 (3 7 1 5)
+    { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 488 (3 7 2 4)
+    { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 489 (3 7 3 3)
+    { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 490 (3 7 4 2)
+    { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 491 (3 7 5 1)
+    { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 492 (3 7 6 0)
+    { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 493 (3 8 0 5)
+    { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 494 (3 8 1 4)
+    { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 495 (3 8 2 3)
+    { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 496 (3 8 3 2)
+    { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 497 (3 8 4 1)
+    { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 498 (3 8 5 0)
+    { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 499 (3 9 0 4)
+    { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 500 (3 9 1 3)
+    { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 501 (3 9 2 2)
+    { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 502 (3 9 3 1)
+    { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 503 (3 9 4 0)
+    { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 504 (3 10 0 3)
+    { 7.555556f, 3.555556f, 2.444444f, 0.047872f }, // 505 (3 10 1 2)
+    { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 506 (3 10 2 1)
+    { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 507 (3 10 3 0)
+    { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 508 (3 11 0 2)
+    { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 509 (3 11 1 1)
+    { 8.111111f, 2.111111f, 2.888889f, 0.113924f }, // 510 (3 11 2 0)
+    { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 511 (3 12 0 1)
+    { 8.444445f, 1.777778f, 2.888889f, 0.150000f }, // 512 (3 12 1 0)
+    { 8.777778f, 1.444444f, 2.888889f, 0.230769f }, // 513 (3 13 0 0)
+    { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 514 (4 0 0 12)
+    { 4.111111f, 11.444445f, 0.222222f, 0.021277f }, // 515 (4 0 1 11)
+    { 4.222222f, 10.888889f, 0.444444f, 0.021845f }, // 516 (4 0 2 10)
+    { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 517 (4 0 3 9)
+    { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 518 (4 0 4 8)
+    { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 519 (4 0 5 7)
+    { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 520 (4 0 6 6)
+    { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 521 (4 0 7 5)
+    { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 522 (4 0 8 4)
+    { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 523 (4 0 9 3)
+    { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 524 (4 0 10 2)
+    { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 525 (4 0 11 1)
+    { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 526 (4 0 12 0)
+    { 4.444445f, 11.111111f, 0.222222f, 0.020270f }, // 527 (4 1 0 11)
+    { 4.555555f, 10.555555f, 0.444444f, 0.020882f }, // 528 (4 1 1 10)
+    { 4.666667f, 10.000000f, 0.666667f, 0.021635f }, // 529 (4 1 2 9)
+    { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 530 (4 1 3 8)
+    { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 531 (4 1 4 7)
+    { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 532 (4 1 5 6)
+    { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 533 (4 1 6 5)
+    { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 534 (4 1 7 4)
+    { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 535 (4 1 8 3)
+    { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 536 (4 1 9 2)
+    { 5.555555f, 5.555556f, 2.444444f, 0.040179f }, // 537 (4 1 10 1)
+    { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 538 (4 1 11 0)
+    { 4.888889f, 10.222222f, 0.444444f, 0.020089f }, // 539 (4 2 0 10)
+    { 5.000000f, 9.666667f, 0.666667f, 0.020882f }, // 540 (4 2 1 9)
+    { 5.111111f, 9.111111f, 0.888889f, 0.021845f }, // 541 (4 2 2 8)
+    { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 542 (4 2 3 7)
+    { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 543 (4 2 4 6)
+    { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 544 (4 2 5 5)
+    { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 545 (4 2 6 4)
+    { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 546 (4 2 7 3)
+    { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 547 (4 2 8 2)
+    { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 548 (4 2 9 1)
+    { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 549 (4 2 10 0)
+    { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 550 (4 3 0 9)
+    { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 551 (4 3 1 8)
+    { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 552 (4 3 2 7)
+    { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 553 (4 3 3 6)
+    { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 554 (4 3 4 5)
+    { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 555 (4 3 5 4)
+    { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 556 (4 3 6 3)
+    { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 557 (4 3 7 2)
+    { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 558 (4 3 8 1)
+    { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 559 (4 3 9 0)
+    { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 560 (4 4 0 8)
+    { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 561 (4 4 1 7)
+    { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 562 (4 4 2 6)
+    { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 563 (4 4 3 5)
+    { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 564 (4 4 4 4)
+    { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 565 (4 4 5 3)
+    { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 566 (4 4 6 2)
+    { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 567 (4 4 7 1)
+    { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 568 (4 4 8 0)
+    { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 569 (4 5 0 7)
+    { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 570 (4 5 1 6)
+    { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 571 (4 5 2 5)
+    { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 572 (4 5 3 4)
+    { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 573 (4 5 4 3)
+    { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 574 (4 5 5 2)
+    { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 575 (4 5 6 1)
+    { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 576 (4 5 7 0)
+    { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 577 (4 6 0 6)
+    { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 578 (4 6 1 5)
+    { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 579 (4 6 2 4)
+    { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 580 (4 6 3 3)
+    { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 581 (4 6 4 2)
+    { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 582 (4 6 5 1)
+    { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 583 (4 6 6 0)
+    { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 584 (4 7 0 5)
+    { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 585 (4 7 1 4)
+    { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 586 (4 7 2 3)
+    { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 587 (4 7 3 2)
+    { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 588 (4 7 4 1)
+    { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 589 (4 7 5 0)
+    { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 590 (4 8 0 4)
+    { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 591 (4 8 1 3)
+    { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 592 (4 8 2 2)
+    { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 593 (4 8 3 1)
+    { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 594 (4 8 4 0)
+    { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 595 (4 9 0 3)
+    { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 596 (4 9 1 2)
+    { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 597 (4 9 2 1)
+    { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 598 (4 9 3 0)
+    { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 599 (4 10 0 2)
+    { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 600 (4 10 1 1)
+    { 8.666667f, 2.000000f, 2.666667f, 0.097826f }, // 601 (4 10 2 0)
+    { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 602 (4 11 0 1)
+    { 9.000000f, 1.666667f, 2.666667f, 0.126761f }, // 603 (4 11 1 0)
+    { 9.333333f, 1.333333f, 2.666667f, 0.187500f }, // 604 (4 12 0 0)
+    { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 605 (5 0 0 11)
+    { 5.111111f, 10.444445f, 0.222222f, 0.018750f }, // 606 (5 0 1 10)
+    { 5.222222f, 9.888889f, 0.444444f, 0.019438f }, // 607 (5 0 2 9)
+    { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 608 (5 0 3 8)
+    { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 609 (5 0 4 7)
+    { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 610 (5 0 5 6)
+    { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 611 (5 0 6 5)
+    { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 612 (5 0 7 4)
+    { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 613 (5 0 8 3)
+    { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 614 (5 0 9 2)
+    { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 615 (5 0 10 1)
+    { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 616 (5 0 11 0)
+    { 5.444445f, 10.111111f, 0.222222f, 0.018182f }, // 617 (5 1 0 10)
+    { 5.555555f, 9.555555f, 0.444444f, 0.018908f }, // 618 (5 1 1 9)
+    { 5.666667f, 9.000000f, 0.666667f, 0.019780f }, // 619 (5 1 2 8)
+    { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 620 (5 1 3 7)
+    { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 621 (5 1 4 6)
+    { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 622 (5 1 5 5)
+    { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 623 (5 1 6 4)
+    { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 624 (5 1 7 3)
+    { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 625 (5 1 8 2)
+    { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 626 (5 1 9 1)
+    { 6.555555f, 4.555556f, 2.444444f, 0.041860f }, // 627 (5 1 10 0)
+    { 5.888889f, 9.222222f, 0.444444f, 0.018480f }, // 628 (5 2 0 9)
+    { 6.000000f, 8.666667f, 0.666667f, 0.019397f }, // 629 (5 2 1 8)
+    { 6.111111f, 8.111111f, 0.888889f, 0.020501f }, // 630 (5 2 2 7)
+    { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 631 (5 2 3 6)
+    { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 632 (5 2 4 5)
+    { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 633 (5 2 5 4)
+    { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 634 (5 2 6 3)
+    { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 635 (5 2 7 2)
+    { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 636 (5 2 8 1)
+    { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 637 (5 2 9 0)
+    { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 638 (5 3 0 8)
+    { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 639 (5 3 1 7)
+    { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 640 (5 3 2 6)
+    { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 641 (5 3 3 5)
+    { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 642 (5 3 4 4)
+    { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 643 (5 3 5 3)
+    { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 644 (5 3 6 2)
+    { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 645 (5 3 7 1)
+    { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 646 (5 3 8 0)
+    { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 647 (5 4 0 7)
+    { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 648 (5 4 1 6)
+    { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 649 (5 4 2 5)
+    { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 650 (5 4 3 4)
+    { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 651 (5 4 4 3)
+    { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 652 (5 4 5 2)
+    { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 653 (5 4 6 1)
+    { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 654 (5 4 7 0)
+    { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 655 (5 5 0 6)
+    { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 656 (5 5 1 5)
+    { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 657 (5 5 2 4)
+    { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 658 (5 5 3 3)
+    { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 659 (5 5 4 2)
+    { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 660 (5 5 5 1)
+    { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 661 (5 5 6 0)
+    { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 662 (5 6 0 5)
+    { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 663 (5 6 1 4)
+    { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 664 (5 6 2 3)
+    { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 665 (5 6 3 2)
+    { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 666 (5 6 4 1)
+    { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 667 (5 6 5 0)
+    { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 668 (5 7 0 4)
+    { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 669 (5 7 1 3)
+    { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 670 (5 7 2 2)
+    { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 671 (5 7 3 1)
+    { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 672 (5 7 4 0)
+    { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 673 (5 8 0 3)
+    { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 674 (5 8 1 2)
+    { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 675 (5 8 2 1)
+    { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 676 (5 8 3 0)
+    { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 677 (5 9 0 2)
+    { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 678 (5 9 1 1)
+    { 9.222222f, 1.888889f, 2.444444f, 0.087379f }, // 679 (5 9 2 0)
+    { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 680 (5 10 0 1)
+    { 9.555555f, 1.555556f, 2.444444f, 0.112500f }, // 681 (5 10 1 0)
+    { 9.888889f, 1.222222f, 2.444444f, 0.163636f }, // 682 (5 11 0 0)
+    { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 683 (6 0 0 10)
+    { 6.111111f, 9.444445f, 0.222222f, 0.017341f }, // 684 (6 0 1 9)
+    { 6.222222f, 8.888889f, 0.444444f, 0.018145f }, // 685 (6 0 2 8)
+    { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 686 (6 0 3 7)
+    { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 687 (6 0 4 6)
+    { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 688 (6 0 5 5)
+    { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 689 (6 0 6 4)
+    { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 690 (6 0 7 3)
+    { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 691 (6 0 8 2)
+    { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 692 (6 0 9 1)
+    { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 693 (6 0 10 0)
+    { 6.444445f, 9.111111f, 0.222222f, 0.017045f }, // 694 (6 1 0 9)
+    { 6.555555f, 8.555555f, 0.444444f, 0.017893f }, // 695 (6 1 1 8)
+    { 6.666667f, 8.000000f, 0.666667f, 0.018908f }, // 696 (6 1 2 7)
+    { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 697 (6 1 3 6)
+    { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 698 (6 1 4 5)
+    { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 699 (6 1 5 4)
+    { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 700 (6 1 6 3)
+    { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 701 (6 1 7 2)
+    { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 702 (6 1 8 1)
+    { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 703 (6 1 9 0)
+    { 6.888889f, 8.222222f, 0.444444f, 0.017717f }, // 704 (6 2 0 8)
+    { 7.000000f, 7.666667f, 0.666667f, 0.018789f }, // 705 (6 2 1 7)
+    { 7.111111f, 7.111111f, 0.888889f, 0.020089f }, // 706 (6 2 2 6)
+    { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 707 (6 2 3 5)
+    { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 708 (6 2 4 4)
+    { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 709 (6 2 5 3)
+    { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 710 (6 2 6 2)
+    { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 711 (6 2 7 1)
+    { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 712 (6 2 8 0)
+    { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 713 (6 3 0 7)
+    { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 714 (6 3 1 6)
+    { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 715 (6 3 2 5)
+    { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 716 (6 3 3 4)
+    { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 717 (6 3 4 3)
+    { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 718 (6 3 5 2)
+    { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 719 (6 3 6 1)
+    { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 720 (6 3 7 0)
+    { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 721 (6 4 0 6)
+    { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 722 (6 4 1 5)
+    { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 723 (6 4 2 4)
+    { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 724 (6 4 3 3)
+    { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 725 (6 4 4 2)
+    { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 726 (6 4 5 1)
+    { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 727 (6 4 6 0)
+    { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 728 (6 5 0 5)
+    { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 729 (6 5 1 4)
+    { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 730 (6 5 2 3)
+    { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 731 (6 5 3 2)
+    { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 732 (6 5 4 1)
+    { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 733 (6 5 5 0)
+    { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 734 (6 6 0 4)
+    { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 735 (6 6 1 3)
+    { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 736 (6 6 2 2)
+    { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 737 (6 6 3 1)
+    { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 738 (6 6 4 0)
+    { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 739 (6 7 0 3)
+    { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 740 (6 7 1 2)
+    { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 741 (6 7 2 1)
+    { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 742 (6 7 3 0)
+    { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 743 (6 8 0 2)
+    { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 744 (6 8 1 1)
+    { 9.777778f, 1.777778f, 2.222222f, 0.080357f }, // 745 (6 8 2 0)
+    { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 746 (6 9 0 1)
+    { 10.111111f, 1.444444f, 2.222222f, 0.103448f }, // 747 (6 9 1 0)
+    { 10.444445f, 1.111111f, 2.222222f, 0.150000f }, // 748 (6 10 0 0)
+    { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 749 (7 0 0 9)
+    { 7.111111f, 8.444445f, 0.222222f, 0.016667f }, // 750 (7 0 1 8)
+    { 7.222222f, 7.888889f, 0.444444f, 0.017613f }, // 751 (7 0 2 7)
+    { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 752 (7 0 3 6)
+    { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 753 (7 0 4 5)
+    { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 754 (7 0 5 4)
+    { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 755 (7 0 6 3)
+    { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 756 (7 0 7 2)
+    { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 757 (7 0 8 1)
+    { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 758 (7 0 9 0)
+    { 7.444445f, 8.111111f, 0.222222f, 0.016575f }, // 759 (7 1 0 8)
+    { 7.555555f, 7.555555f, 0.444444f, 0.017578f }, // 760 (7 1 1 7)
+    { 7.666667f, 7.000000f, 0.666667f, 0.018789f }, // 761 (7 1 2 6)
+    { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 762 (7 1 3 5)
+    { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 763 (7 1 4 4)
+    { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 764 (7 1 5 3)
+    { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 765 (7 1 6 2)
+    { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 766 (7 1 7 1)
+    { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 767 (7 1 8 0)
+    { 7.888889f, 7.222222f, 0.444444f, 0.017613f }, // 768 (7 2 0 7)
+    { 8.000000f, 6.666667f, 0.666667f, 0.018908f }, // 769 (7 2 1 6)
+    { 8.111111f, 6.111111f, 0.888889f, 0.020501f }, // 770 (7 2 2 5)
+    { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 771 (7 2 3 4)
+    { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 772 (7 2 4 3)
+    { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 773 (7 2 5 2)
+    { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 774 (7 2 6 1)
+    { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 775 (7 2 7 0)
+    { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 776 (7 3 0 6)
+    { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 777 (7 3 1 5)
+    { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 778 (7 3 2 4)
+    { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 779 (7 3 3 3)
+    { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 780 (7 3 4 2)
+    { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 781 (7 3 5 1)
+    { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 782 (7 3 6 0)
+    { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 783 (7 4 0 5)
+    { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 784 (7 4 1 4)
+    { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 785 (7 4 2 3)
+    { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 786 (7 4 3 2)
+    { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 787 (7 4 4 1)
+    { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 788 (7 4 5 0)
+    { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 789 (7 5 0 4)
+    { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 790 (7 5 1 3)
+    { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 791 (7 5 2 2)
+    { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 792 (7 5 3 1)
+    { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 793 (7 5 4 0)
+    { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 794 (7 6 0 3)
+    { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 795 (7 6 1 2)
+    { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 796 (7 6 2 1)
+    { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 797 (7 6 3 0)
+    { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 798 (7 7 0 2)
+    { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 799 (7 7 1 1)
+    { 10.333333f, 1.666667f, 2.000000f, 0.075630f }, // 800 (7 7 2 0)
+    { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 801 (7 8 0 1)
+    { 10.666667f, 1.333333f, 2.000000f, 0.097826f }, // 802 (7 8 1 0)
+    { 11.000000f, 1.000000f, 2.000000f, 0.142857f }, // 803 (7 9 0 0)
+    { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 804 (8 0 0 8)
+    { 8.111111f, 7.444445f, 0.222222f, 0.016575f }, // 805 (8 0 1 7)
+    { 8.222222f, 6.888889f, 0.444444f, 0.017717f }, // 806 (8 0 2 6)
+    { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 807 (8 0 3 5)
+    { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 808 (8 0 4 4)
+    { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 809 (8 0 5 3)
+    { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 810 (8 0 6 2)
+    { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 811 (8 0 7 1)
+    { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 812 (8 0 8 0)
+    { 8.444445f, 7.111111f, 0.222222f, 0.016667f }, // 813 (8 1 0 7)
+    { 8.555555f, 6.555555f, 0.444444f, 0.017893f }, // 814 (8 1 1 6)
+    { 8.666667f, 6.000000f, 0.666667f, 0.019397f }, // 815 (8 1 2 5)
+    { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 816 (8 1 3 4)
+    { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 817 (8 1 4 3)
+    { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 818 (8 1 5 2)
+    { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 819 (8 1 6 1)
+    { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 820 (8 1 7 0)
+    { 8.888889f, 6.222222f, 0.444444f, 0.018145f }, // 821 (8 2 0 6)
+    { 9.000000f, 5.666667f, 0.666667f, 0.019780f }, // 822 (8 2 1 5)
+    { 9.111111f, 5.111111f, 0.888889f, 0.021845f }, // 823 (8 2 2 4)
+    { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 824 (8 2 3 3)
+    { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 825 (8 2 4 2)
+    { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 826 (8 2 5 1)
+    { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 827 (8 2 6 0)
+    { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 828 (8 3 0 5)
+    { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 829 (8 3 1 4)
+    { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 830 (8 3 2 3)
+    { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 831 (8 3 3 2)
+    { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 832 (8 3 4 1)
+    { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 833 (8 3 5 0)
+    { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 834 (8 4 0 4)
+    { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 835 (8 4 1 3)
+    { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 836 (8 4 2 2)
+    { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 837 (8 4 3 1)
+    { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 838 (8 4 4 0)
+    { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 839 (8 5 0 3)
+    { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 840 (8 5 1 2)
+    { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 841 (8 5 2 1)
+    { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 842 (8 5 3 0)
+    { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 843 (8 6 0 2)
+    { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 844 (8 6 1 1)
+    { 10.888889f, 1.555556f, 1.777778f, 0.072581f }, // 845 (8 6 2 0)
+    { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 846 (8 7 0 1)
+    { 11.222222f, 1.222222f, 1.777778f, 0.094737f }, // 847 (8 7 1 0)
+    { 11.555555f, 0.888889f, 1.777778f, 0.140625f }, // 848 (8 8 0 0)
+    { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 849 (9 0 0 7)
+    { 9.111111f, 6.444445f, 0.222222f, 0.017045f }, // 850 (9 0 1 6)
+    { 9.222222f, 5.888889f, 0.444444f, 0.018480f }, // 851 (9 0 2 5)
+    { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 852 (9 0 3 4)
+    { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 853 (9 0 4 3)
+    { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 854 (9 0 5 2)
+    { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 855 (9 0 6 1)
+    { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 856 (9 0 7 0)
+    { 9.444445f, 6.111111f, 0.222222f, 0.017341f }, // 857 (9 1 0 6)
+    { 9.555555f, 5.555555f, 0.444444f, 0.018908f }, // 858 (9 1 1 5)
+    { 9.666667f, 5.000000f, 0.666667f, 0.020882f }, // 859 (9 1 2 4)
+    { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 860 (9 1 3 3)
+    { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 861 (9 1 4 2)
+    { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 862 (9 1 5 1)
+    { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 863 (9 1 6 0)
+    { 9.888889f, 5.222222f, 0.444444f, 0.019438f }, // 864 (9 2 0 5)
+    { 10.000000f, 4.666667f, 0.666667f, 0.021635f }, // 865 (9 2 1 4)
+    { 10.111111f, 4.111111f, 0.888889f, 0.024523f }, // 866 (9 2 2 3)
+    { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 867 (9 2 3 2)
+    { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 868 (9 2 4 1)
+    { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 869 (9 2 5 0)
+    { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 870 (9 3 0 4)
+    { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 871 (9 3 1 3)
+    { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 872 (9 3 2 2)
+    { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 873 (9 3 3 1)
+    { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 874 (9 3 4 0)
+    { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 875 (9 4 0 3)
+    { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 876 (9 4 1 2)
+    { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 877 (9 4 2 1)
+    { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 878 (9 4 3 0)
+    { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 879 (9 5 0 2)
+    { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 880 (9 5 1 1)
+    { 11.444445f, 1.444444f, 1.555556f, 0.070866f }, // 881 (9 5 2 0)
+    { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 882 (9 6 0 1)
+    { 11.777778f, 1.111111f, 1.555556f, 0.093750f }, // 883 (9 6 1 0)
+    { 12.111111f, 0.777778f, 1.555556f, 0.142857f }, // 884 (9 7 0 0)
+    { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 885 (10 0 0 6)
+    { 10.111111f, 5.444445f, 0.222222f, 0.018182f }, // 886 (10 0 1 5)
+    { 10.222222f, 4.888889f, 0.444444f, 0.020089f }, // 887 (10 0 2 4)
+    { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 888 (10 0 3 3)
+    { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 889 (10 0 4 2)
+    { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 890 (10 0 5 1)
+    { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 891 (10 0 6 0)
+    { 10.444445f, 5.111111f, 0.222222f, 0.018750f }, // 892 (10 1 0 5)
+    { 10.555555f, 4.555555f, 0.444444f, 0.020882f }, // 893 (10 1 1 4)
+    { 10.666667f, 4.000000f, 0.666667f, 0.023684f }, // 894 (10 1 2 3)
+    { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 895 (10 1 3 2)
+    { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 896 (10 1 4 1)
+    { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 897 (10 1 5 0)
+    { 10.888889f, 4.222222f, 0.444444f, 0.021845f }, // 898 (10 2 0 4)
+    { 11.000000f, 3.666667f, 0.666667f, 0.025070f }, // 899 (10 2 1 3)
+    { 11.111111f, 3.111111f, 0.888889f, 0.029605f }, // 900 (10 2 2 2)
+    { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 901 (10 2 3 1)
+    { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 902 (10 2 4 0)
+    { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 903 (10 3 0 3)
+    { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 904 (10 3 1 2)
+    { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 905 (10 3 2 1)
+    { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 906 (10 3 3 0)
+    { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 907 (10 4 0 2)
+    { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 908 (10 4 1 1)
+    { 12.000000f, 1.333333f, 1.333333f, 0.070313f }, // 909 (10 4 2 0)
+    { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 910 (10 5 0 1)
+    { 12.333333f, 1.000000f, 1.333333f, 0.094737f }, // 911 (10 5 1 0)
+    { 12.666667f, 0.666667f, 1.333333f, 0.150000f }, // 912 (10 6 0 0)
+    { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 913 (11 0 0 5)
+    { 11.111111f, 4.444445f, 0.222222f, 0.020270f }, // 914 (11 0 1 4)
+    { 11.222222f, 3.888889f, 0.444444f, 0.023018f }, // 915 (11 0 2 3)
+    { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 916 (11 0 3 2)
+    { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 917 (11 0 4 1)
+    { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 918 (11 0 5 0)
+    { 11.444445f, 4.111111f, 0.222222f, 0.021277f }, // 919 (11 1 0 4)
+    { 11.555555f, 3.555556f, 0.444444f, 0.024457f }, // 920 (11 1 1 3)
+    { 11.666667f, 3.000000f, 0.666667f, 0.028939f }, // 921 (11 1 2 2)
+    { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 922 (11 1 3 1)
+    { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 923 (11 1 4 0)
+    { 11.888889f, 3.222222f, 0.444444f, 0.026239f }, // 924 (11 2 0 3)
+    { 12.000000f, 2.666667f, 0.666667f, 0.031690f }, // 925 (11 2 1 2)
+    { 12.111111f, 2.111111f, 0.888889f, 0.040359f }, // 926 (11 2 2 1)
+    { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 927 (11 2 3 0)
+    { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 928 (11 3 0 2)
+    { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 929 (11 3 1 1)
+    { 12.555555f, 1.222222f, 1.111111f, 0.070866f }, // 930 (11 3 2 0)
+    { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 931 (11 4 0 1)
+    { 12.888889f, 0.888889f, 1.111111f, 0.097826f }, // 932 (11 4 1 0)
+    { 13.222222f, 0.555556f, 1.111111f, 0.163636f }, // 933 (11 5 0 0)
+    { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 934 (12 0 0 4)
+    { 12.111111f, 3.444444f, 0.222222f, 0.024000f }, // 935 (12 0 1 3)
+    { 12.222222f, 2.888889f, 0.444444f, 0.028481f }, // 936 (12 0 2 2)
+    { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 937 (12 0 3 1)
+    { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 938 (12 0 4 0)
+    { 12.444445f, 3.111111f, 0.222222f, 0.025862f }, // 939 (12 1 0 3)
+    { 12.555555f, 2.555556f, 0.444444f, 0.031359f }, // 940 (12 1 1 2)
+    { 12.666667f, 2.000000f, 0.666667f, 0.040179f }, // 941 (12 1 2 1)
+    { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 942 (12 1 3 0)
+    { 12.888889f, 2.222222f, 0.444444f, 0.035156f }, // 943 (12 2 0 2)
+    { 13.000000f, 1.666667f, 0.666667f, 0.047120f }, // 944 (12 2 1 1)
+    { 13.111111f, 1.111111f, 0.888889f, 0.072581f }, // 945 (12 2 2 0)
+    { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 946 (12 3 0 1)
+    { 13.444445f, 0.777778f, 0.888889f, 0.103448f }, // 947 (12 3 1 0)
+    { 13.777778f, 0.444444f, 0.888889f, 0.187500f }, // 948 (12 4 0 0)
+    { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 949 (13 0 0 3)
+    { 13.111111f, 2.444444f, 0.222222f, 0.031250f }, // 950 (13 0 1 2)
+    { 13.222222f, 1.888889f, 0.444444f, 0.040359f }, // 951 (13 0 2 1)
+    { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 952 (13 0 3 0)
+    { 13.444445f, 2.111111f, 0.222222f, 0.035294f }, // 953 (13 1 0 2)
+    { 13.555555f, 1.555556f, 0.444444f, 0.047872f }, // 954 (13 1 1 1)
+    { 13.666667f, 1.000000f, 0.666667f, 0.075630f }, // 955 (13 1 2 0)
+    { 13.888889f, 1.222222f, 0.444444f, 0.059603f }, // 956 (13 2 0 1)
+    { 14.000000f, 0.666667f, 0.666667f, 0.112500f }, // 957 (13 2 1 0)
+    { 14.333333f, 0.333333f, 0.666667f, 0.230769f }, // 958 (13 3 0 0)
+    { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 959 (14 0 0 2)
+    { 14.111111f, 1.444444f, 0.222222f, 0.049180f }, // 960 (14 0 1 1)
+    { 14.222222f, 0.888889f, 0.444444f, 0.080357f }, // 961 (14 0 2 0)
+    { 14.444445f, 1.111111f, 0.222222f, 0.062500f }, // 962 (14 1 0 1)
+    { 14.555555f, 0.555556f, 0.444444f, 0.126761f }, // 963 (14 1 1 0)
+    { 14.888889f, 0.222222f, 0.444444f, 0.321429f }, // 964 (14 2 0 0)
+    { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 965 (15 0 0 1)
+    { 15.111111f, 0.444444f, 0.222222f, 0.150000f }, // 966 (15 0 1 0)
+    { 15.444445f, 0.111111f, 0.222222f, 0.600000f }, // 967 (15 1 0 0)
+    { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 968 (16 0 0 0)
+}; // 969 four cluster elements
+
+#if ICBC_USE_SIMD
+
+// SIMD three-cluster fit: tries every split of the m_count ordered points into three consecutive runs and keeps
+// the grid-snapped endpoint pair with the lowest error. Writes *start / *end and returns true only if m_besterror is beaten.
+bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
+{
+    const int count = m_count;
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); // xyz scale by 1/2, w by 1/4; also the bias added before truncate
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); // quantization scale per channel; the w lane is zero
+    const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f);
+
+    // best endpoints and error found so far
+    SimdVector beststart = SimdVector(0.0f);
+    SimdVector bestend = SimdVector(0.0f);
+    SimdVector besterror = SimdVector(FLT_MAX);
+
+    SimdVector x0 = zero; // sum of m_weighted[0 .. c0-1]
+
+    // check all possible clusters for this total order
+    for (int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero; // sum of m_weighted[c0 .. c0+c1-1]
+
+        for (int c1 = 0; c1 <= count - c0; c1++)
+        {
+            const SimdVector x2 = m_xsum - x1 - x0; // remainder of the total
+
+            // scalar form: alphax_sum = x0 + x1 * 0.5f (xyz lanes); alpha2_sum = w0 + w1 * 0.25f (w lane)
+            const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
+            const SimdVector alpha2_sum = alphax_sum.splatW();
+
+            // scalar form: betax_sum = x2 + x1 * 0.5f (xyz lanes); beta2_sum = w2 + w1 * 0.25f (w lane)
+            const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
+            const SimdVector beta2_sum = betax_sum.splatW();
+
+            //const float alphabeta_sum = w1 * 0.25f;
+            const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
+
+            // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+            const SimdVector factor = reciprocal(negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum));
+            // solve the 2x2 system for both endpoints; negativeMultiplySubtract(a, b, c) stands for c - a * b (compare the factor line)
+            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+            // clamp to [0, 1], then snap to the grid: scale, add the bias from half, truncate, scale back
+            a = min(one, max(zero, a));
+            b = min(one, max(zero, b));
+            a = truncate(multiplyAdd(grid, a, half)) * gridrcp;
+            b = truncate(multiplyAdd(grid, b, half)) * gridrcp;
+
+            // compute the error (we skip the constant xxsum)
+            SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum);
+            SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum);
+            SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2);
+            SimdVector e4 = multiplyAdd(two, e3, e1);
+
+            // apply the metric to the error term
+            SimdVector e5 = e4 * m_metricSqr;
+            SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); // x + y + z terms; the w lane of e5 is not used
+
+            // keep the solution if it wins
+            if (compareAnyLessThan(error, besterror))
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+            }
+
+            x1 += m_weighted[c0 + c1]; // NOTE(review): index == count on the last pass; confirm m_weighted has that slot
+        }
+
+        x0 += m_weighted[c0]; // NOTE(review): index == count on the last pass here too
+    }
+
+    // save the block if necessary
+    if (compareAnyLessThan(besterror, m_besterror))
+    {
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+// SIMD four-cluster fit: tries every split of the m_count ordered points into four consecutive runs and keeps
+// the grid-snapped endpoint pair with the lowest error. Writes *start / *end and returns true only if m_besterror is beaten.
+bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
+{
+    const int count = m_count;
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half = SimdVector(0.5f); // bias added before truncate when snapping to the grid
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector onethird(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f); // xyz: 1/3, w: (1/3)^2
+    const SimdVector twothirds(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f); // xyz: 2/3, w: (2/3)^2
+    const SimdVector twonineths = SimdVector(2.0f / 9.0f); // (1/3) * (2/3)
+    const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); // quantization scale per channel; the w lane is zero
+    const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f);
+
+    // best endpoints and error found so far
+    SimdVector beststart = SimdVector(0.0f);
+    SimdVector bestend = SimdVector(0.0f);
+    SimdVector besterror = SimdVector(FLT_MAX);
+
+    SimdVector x0 = zero; // sum of m_weighted[0 .. c0-1]
+
+    // check all possible clusters for this total order
+    for (int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero; // sum of m_weighted[c0 .. c0+c1-1]
+
+        for (int c1 = 0; c1 <= count - c0; c1++)
+        {
+            SimdVector x2 = zero; // sum of m_weighted[c0+c1 .. c0+c1+c2-1]
+
+            for (int c2 = 0; c2 <= count - c0 - c1; c2++)
+            {
+                const SimdVector x3 = m_xsum - x2 - x1 - x0; // remainder of the total
+
+                // scalar form: alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f)
+                const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
+                const SimdVector alpha2_sum = alphax_sum.splatW();
+
+                // scalar form: betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f)
+                const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
+                const SimdVector beta2_sum = betax_sum.splatW();
+
+                //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                const SimdVector alphabeta_sum = twonineths * (x1 + x2).splatW(); // alphabeta_sum
+
+                //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+                const SimdVector factor = reciprocal(negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum));
+                // solve the 2x2 system for both endpoints; negativeMultiplySubtract(a, b, c) stands for c - a * b (compare the factor line)
+                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+                // clamp to [0, 1], then snap to the grid: scale, add the bias from half, truncate, scale back
+                a = min(one, max(zero, a));
+                b = min(one, max(zero, b));
+                a = truncate(multiplyAdd(grid, a, half)) * gridrcp;
+                b = truncate(multiplyAdd(grid, b, half)) * gridrcp;
+
+                // compute the error (we skip the constant xxsum)
+                // error = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+                SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum);
+                SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum);
+                SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2);
+                SimdVector e4 = multiplyAdd(two, e3, e1);
+
+                // apply the metric to the error term
+                SimdVector e5 = e4 * m_metricSqr;
+                SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); // x + y + z terms; the w lane of e5 is not used
+
+                // keep the solution if it wins
+                if (compareAnyLessThan(error, besterror))
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                }
+
+                x2 += m_weighted[c0 + c1 + c2]; // NOTE(review): index == count on the last pass; confirm m_weighted has that slot
+            }
+
+            x1 += m_weighted[c0 + c1]; // NOTE(review): index == count on the last pass here too
+        }
+
+        x0 += m_weighted[c0]; // NOTE(review): index == count on the last pass here too
+    }
+
+    // save the block if necessary
+    if (compareAnyLessThan(besterror, m_besterror))
+    {
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+// SIMD three-cluster fit that reads the weight-only terms from s_threeElement instead of computing them per split.
+// NOTE(review): i advances once per (c0, c1) pair, so the table rows must be laid out for exactly these loop bounds (m_count) - confirm.
+bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end)
+{
+    const int count = m_count;
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); // scales x1 below and is the bias added before truncate
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); // quantization scale per channel; the w lane is zero
+    const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f);
+
+    SimdVector beststart = SimdVector(0.0f); // best endpoints and error found so far
+    SimdVector bestend = SimdVector(0.0f);
+    SimdVector besterror = SimdVector(FLT_MAX);
+
+    SimdVector x0 = zero; // sum of m_weighted[0 .. c0-1]
+
+    // check all possible clusters for this total order
+    for (int c0 = 0, i = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero; // sum of m_weighted[c0 .. c0+c1-1]
+
+        for (int c1 = 0; c1 <= count - c0; c1++, i++)
+        {
+            const SimdVector constants = SimdVector((const float *)&s_threeElement[i]); // table row for this (c0, c1); presumably four packed floats
+
+            const SimdVector alpha2_sum = constants.splatX();
+            const SimdVector beta2_sum = constants.splatY();
+            const SimdVector alphabeta_sum = constants.splatZ();
+            const SimdVector factor = constants.splatW();
+
+            const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // x0 + x1 * half
+            const SimdVector betax_sum = m_xsum - alphax_sum;
+
+            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+            // clamp to [0, 1], then snap to the grid: scale, add the bias from half, truncate, scale back
+            a = min(one, max(zero, a));
+            b = min(one, max(zero, b));
+            a = truncate(multiplyAdd(grid, a, half)) * gridrcp;
+            b = truncate(multiplyAdd(grid, b, half)) * gridrcp;
+
+            // compute the error (we skip the constant xxsum)
+            SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum);
+            SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum);
+            SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2);
+            SimdVector e4 = multiplyAdd(two, e3, e1);
+
+            // apply the metric to the error term
+            SimdVector e5 = e4 * m_metricSqr;
+            SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); // x + y + z terms; the w lane of e5 is not used
+
+            // keep the solution if it wins
+            if (compareAnyLessThan(error, besterror))
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+            }
+
+            x1 += m_weighted[c0 + c1]; // NOTE(review): index == count on the last pass; confirm m_weighted has that slot
+        }
+
+        x0 += m_weighted[c0]; // NOTE(review): index == count on the last pass here too
+    }
+
+    // save the block if necessary
+    if (compareAnyLessThan(besterror, m_besterror))
+    {
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        m_besterror = besterror; // save the error
+
+        return true;
+    }
+
+    return false;
+}
+
+// SIMD four-cluster fit for exactly 16 points: the loops are fixed at 16 and the weight-only terms come from
+// s_fourElement, whose rows follow the same (c0, c1, c2) order. Writes *start / *end and returns true only if m_besterror is beaten.
+bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end)
+{
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half = SimdVector(0.5f); // bias added before truncate when snapping to the grid
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector onethird(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f);
+    const SimdVector twothirds(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f);
+    const SimdVector grid(31.0f, 63.0f, 31.0f, 0.0f); // quantization scale per channel; the w lane is zero
+    const SimdVector gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0.0f);
+
+    SimdVector beststart = SimdVector(0.0f); // best endpoints and error found so far
+    SimdVector bestend = SimdVector(0.0f);
+    SimdVector besterror = SimdVector(FLT_MAX);
+
+    SimdVector x0 = zero; // sum of m_weighted[0 .. c0-1]
+
+    // check all possible clusters for this total order
+    for (int c0 = 0, i = 0; c0 <= 16; c0++)
+    {
+        SimdVector x1 = zero; // sum of m_weighted[c0 .. c0+c1-1]
+
+        for (int c1 = 0; c1 <= 16 - c0; c1++)
+        {
+            SimdVector x2 = zero; // sum of m_weighted[c0+c1 .. c0+c1+c2-1]
+
+            for (int c2 = 0; c2 <= 16 - c0 - c1; c2++, i++)
+            {
+                const SimdVector constants = SimdVector((const float *)&s_fourElement[i]); // table row for this (c0, c1, c2)
+
+                const SimdVector alpha2_sum = constants.splatX();
+                const SimdVector beta2_sum = constants.splatY();
+                const SimdVector alphabeta_sum = constants.splatZ();
+                const SimdVector factor = constants.splatW();
+                // x0 + x1 * twothirds + x2 * onethird; betax_sum is then whatever is left of m_xsum
+                const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0));
+                const SimdVector betax_sum = m_xsum - alphax_sum;
+
+                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+                // clamp to [0, 1], then snap to the grid: scale, add the bias from half, truncate, scale back
+                a = min(one, max(zero, a));
+                b = min(one, max(zero, b));
+                a = truncate(multiplyAdd(grid, a, half)) * gridrcp;
+                b = truncate(multiplyAdd(grid, b, half)) * gridrcp;
+
+                // compute the error (we skip the constant xxsum)
+                // error = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+                SimdVector e1 = multiplyAdd(a*a, alpha2_sum, b*b*beta2_sum);
+                SimdVector e2 = negativeMultiplySubtract(a, alphax_sum, a*b*alphabeta_sum);
+                SimdVector e3 = negativeMultiplySubtract(b, betax_sum, e2);
+                SimdVector e4 = multiplyAdd(two, e3, e1);
+
+                // apply the metric to the error term
+                SimdVector e5 = e4 * m_metricSqr;
+                SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); // x + y + z terms; the w lane of e5 is not used
+
+                // keep the solution if it wins
+                if (compareAnyLessThan(error, besterror))
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                }
+
+                x2 += m_weighted[c0 + c1 + c2]; // NOTE(review): index reaches 16 on the last pass; confirm m_weighted has that slot
+            }
+
+            x1 += m_weighted[c0 + c1]; // NOTE(review): index reaches 16 on the last pass here too
+        }
+
+        x0 += m_weighted[c0]; // NOTE(review): index reaches 16 on the last pass here too
+    }
+
+    // save the block if necessary
+    if (compareAnyLessThan(besterror, m_besterror))
+    {
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        m_besterror = besterror; // save the error
+
+        return true;
+    }
+
+    return false;
+}
+
+#else
+
+// Snaps each channel of v to the 5:6:5 grid, stepping up one level when v lies above the tabulated midpoint.
+// This is the ideal way to round, but it's too expensive to do this in the inner loop.
+inline Vector3 round565(const Vector3 & v) {
+    static const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f };
+    static const Vector3 grid = { 31.0f, 63.0f, 31.0f };
+    Vector3 level = floor(grid * v);
+    level.x += (v.x > midpoints5[int(level.x)]) ? 1.0f : 0.0f;
+    level.y += (v.y > midpoints6[int(level.y)]) ? 1.0f : 0.0f;
+    level.z += (v.z > midpoints5[int(level.z)]) ? 1.0f : 0.0f;
+    level *= gridrcp;
+    return level;
+}
+
+// Scalar three-cluster fit. Walks every split of the ordered points into three consecutive runs, solves for
+// the endpoint pair of each split, snaps it to the 565 grid and remembers the pair with the smallest
+// metric-weighted error. Returns true, and writes *start / *end, only when m_besterror is beaten.
+bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
+{
+    const Vector3 gridScale = { 31.0f, 63.0f, 31.0f };
+    const Vector3 gridStep = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f };
+    const uint total = m_count;
+
+    float lowestError = FLT_MAX;
+    Vector3 winnerA = { 0.0f };
+    Vector3 winnerB = { 0.0f };
+
+    // accumulated color / weight of the first run
+    Vector3 sum0 = { 0.0f };
+    float weight0 = 0.0f;
+
+    for (uint n0 = 0; n0 <= total; ++n0)
+    {
+        // accumulated color / weight of the second run
+        Vector3 sum1 = { 0.0f };
+        float weight1 = 0.0f;
+
+        for (uint n1 = 0; n0 + n1 <= total; ++n1)
+        {
+            const float weight2 = m_wsum - weight0 - weight1;
+
+            // These factors depend only on the weights and could be entirely precomputed.
+            const float aa = weight0 + weight1 * 0.25f;
+            const float bb = weight2 + weight1 * 0.25f;
+            const float ab = weight1 * 0.25f;
+            const float invDet = 1.0f / (aa * bb - ab * ab);
+
+            const Vector3 ax = sum0 + sum1 * 0.5f;
+            const Vector3 bx = m_xsum - ax;
+
+            // solve for both endpoints and clamp them
+            Vector3 endA = saturate((ax*bb - bx * ab) * invDet);
+            Vector3 endB = saturate((bx*aa - ax * ab) * invDet);
+#if ICBC_PERFECT_ROUND
+            endA = round565(endA);
+            endB = round565(endB);
+#else
+            endA = round(gridScale * endA) * gridStep;
+            endB = round(gridScale * endB) * gridStep;
+#endif
+
+            // per-channel error term for this endpoint pair
+            const Vector3 channelError = endA * endA*aa + endB * endB*bb + 2.0f*(endA*endB*ab - endA * ax - endB * bx);
+
+            // weight the channels by the metric
+            const float candidate = dot(channelError, m_metricSqr);
+
+            // only a strictly smaller error replaces the current winner
+            if (candidate < lowestError)
+            {
+                winnerB = endB;
+                winnerA = endA;
+                lowestError = candidate;
+            }
+
+            // grow the second run by one point
+            const uint next = n0 + n1;
+            sum1 += m_weighted[next];
+            weight1 += m_weights[next];
+        }
+
+        // grow the first run by one point
+        sum0 += m_weighted[n0];
+        weight0 += m_weights[n0];
+    }
+
+    // nothing to report unless the stored error was beaten
+    if (!(lowestError < m_besterror))
+    {
+        return false;
+    }
+
+    // publish the improved endpoints and remember their error
+    *start = winnerA;
+    *end = winnerB;
+    m_besterror = lowestError;
+    return true;
+}
+
+// Scalar four-cluster fit. Walks every split of the ordered points into four consecutive runs, solves for
+// the endpoint pair of each split, snaps it to the 565 grid and remembers the pair with the smallest
+// metric-weighted error. Returns true, and writes *start / *end, only when m_besterror is beaten.
+bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
+{
+    const Vector3 gridScale = { 31.0f, 63.0f, 31.0f };
+    const Vector3 gridStep = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f };
+    const uint total = m_count;
+
+    float lowestError = FLT_MAX;
+    Vector3 winnerA = { 0.0f };
+    Vector3 winnerB = { 0.0f };
+
+    // accumulated color / weight of the first run
+    Vector3 sum0 = { 0.0f };
+    float weight0 = 0.0f;
+    for (uint n0 = 0; n0 <= total; ++n0)
+    {
+        // accumulated color / weight of the second run
+        Vector3 sum1 = { 0.0f };
+        float weight1 = 0.0f;
+        for (uint n1 = 0; n0 + n1 <= total; ++n1)
+        {
+            // accumulated color / weight of the third run
+            Vector3 sum2 = { 0.0f };
+            float weight2 = 0.0f;
+            for (uint n2 = 0; n0 + n1 + n2 <= total; ++n2)
+            {
+                const float weight3 = m_wsum - weight0 - weight1 - weight2;
+
+                // weight-only terms of the 2x2 system
+                const float aa = weight0 + weight1 * (4.0f / 9.0f) + weight2 * (1.0f / 9.0f);
+                const float bb = weight3 + weight2 * (4.0f / 9.0f) + weight1 * (1.0f / 9.0f);
+                const float ab = (weight1 + weight2) * (2.0f / 9.0f);
+                const float invDet = 1.0f / (aa * bb - ab * ab);
+
+                const Vector3 ax = sum0 + sum1 * (2.0f / 3.0f) + sum2 * (1.0f / 3.0f);
+                const Vector3 bx = m_xsum - ax;
+
+                // solve for both endpoints and clamp them
+                Vector3 endA = saturate((ax*bb - bx * ab)*invDet);
+                Vector3 endB = saturate((bx*aa - ax * ab)*invDet);
+#if ICBC_PERFECT_ROUND
+                endA = round565(endA);
+                endB = round565(endB);
+#else
+                endA = round(gridScale * endA) * gridStep;
+                endB = round(gridScale * endB) * gridStep;
+#endif
+                // @@ It would be much more accurate to evaluate the error exactly.
+
+                // per-channel error term for this endpoint pair
+                const Vector3 channelError = endA * endA*aa + endB * endB*bb + 2.0f*(endA*endB*ab - endA * ax - endB * bx);
+
+                // weight the channels by the metric
+                const float candidate = dot(channelError, m_metricSqr);
+
+                // only a strictly smaller error replaces the current winner
+                if (candidate < lowestError)
+                {
+                    winnerB = endB;
+                    winnerA = endA;
+                    lowestError = candidate;
+                }
+
+                // grow the third run by one point
+                const uint next = n0 + n1 + n2;
+                sum2 += m_weighted[next];
+                weight2 += m_weights[next];
+            }
+
+            // grow the second run by one point
+            sum1 += m_weighted[n0 + n1];
+            weight1 += m_weights[n0 + n1];
+        }
+
+        // grow the first run by one point
+        sum0 += m_weighted[n0];
+        weight0 += m_weights[n0];
+    }
+
+    // nothing to report unless the stored error was beaten
+    if (!(lowestError < m_besterror))
+    {
+        return false;
+    }
+
+    *start = winnerA;
+    *end = winnerB;
+    m_besterror = lowestError;
+    return true;
+}
+
+// Table-driven three-cluster fit.
+// Walks every split of the m_count ordered colors into a leading cluster of c0 entries, a middle
+// cluster of c1 entries and the remainder. For each split it solves for the two endpoints using the
+// coefficients stored in s_threeElement, snaps them to the 5:6:5 grid and keeps the pair with the
+// lowest error.
+// Writes *start / *end and returns true only when that error is below m_besterror (which is then
+// updated); otherwise the outputs are left untouched and false is returned.
+bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end)
+{
+    const uint count = m_count;
+    const Vector3 grid = { 31.0f, 63.0f, 31.0f };
+    const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f };
+
+    // declare variables
+    Vector3 beststart = { 0.0f };
+    Vector3 bestend = { 0.0f };
+    float besterror = FLT_MAX;
+
+    // Running sum of the weighted colors in the leading cluster.
+    // NOTE(review): w0 and w1 are accumulated below but never read in this function.
+    Vector3 x0 = { 0.0f };
+    float w0 = 0.0f;
+
+    // check all possible clusters for this total order
+    // i is the running index into s_threeElement; it advances once per (c0, c1) pair.
+    // NOTE(review): presumably the table was generated with this same loop nesting - confirm.
+    for (uint c0 = 0, i = 0; c0 <= count; c0++)
+    {
+        // Running sum of the weighted colors in the middle cluster.
+        Vector3 x1 = { 0.0f };
+        float w1 = 0.0f;
+
+        for (uint c1 = 0; c1 <= count - c0; c1++, i++)
+        {
+            float const alpha2_sum = s_threeElement[i].alpha2_sum;
+            float const beta2_sum = s_threeElement[i].beta2_sum;
+            float const alphabeta_sum = s_threeElement[i].alphabeta_sum;
+            float const factor = s_threeElement[i].factor;
+
+            // The leading cluster counts fully towards the start endpoint, the middle cluster by one half.
+            Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            Vector3 const betax_sum = m_xsum - alphax_sum;
+
+            // Least-squares endpoints for this split.
+            Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum) * factor;
+            Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum) * factor;
+
+            // clamp to the grid
+            a = saturate(a);
+            b = saturate(b);
+#if ICBC_PERFECT_ROUND
+            a = round565(a);
+            b = round565(b);
+#else
+            a = round(grid * a) * gridrcp;
+            b = round(grid * b) * gridrcp;
+#endif
+
+            // compute the error
+            Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+            // apply the metric to the error term
+            float error = dot(e1, m_metricSqr);
+
+            // keep the solution if it wins
+            if (error < besterror)
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+            }
+
+            // Move the next ordered color into the middle cluster.
+            // NOTE(review): on the last iteration c0 + c1 == count, so entry [count] is read and then
+            // discarded; confirm m_weighted / m_weights hold at least count + 1 entries.
+            x1 += m_weighted[c0 + c1];
+            w1 += m_weights[c0 + c1];
+        }
+
+        // Move the next ordered color into the leading cluster (same index-[count] caveat as above).
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if (besterror < m_besterror)
+    {
+
+        *start = beststart;
+        *end = bestend;
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+// Table-driven four-cluster fit.
+// Walks every split of the m_count ordered colors into consecutive clusters of c0, c1 and c2
+// entries plus the remainder. For each split it solves for the two endpoints using the coefficients
+// stored in s_fourElement, snaps them to the 5:6:5 grid and keeps the pair with the lowest error.
+// Writes *start / *end and returns true only when that error is below m_besterror (which is then
+// updated); otherwise the outputs are left untouched and false is returned.
+bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end)
+{
+    const uint count = m_count;
+    const Vector3 grid = { 31.0f, 63.0f, 31.0f };
+    const Vector3 gridrcp = { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f };
+
+    // declare variables
+    Vector3 beststart = { 0.0f };
+    Vector3 bestend = { 0.0f };
+    float besterror = FLT_MAX;
+
+    // Running sum of the weighted colors in the first cluster.
+    // NOTE(review): w0, w1 and w2 are accumulated below but never read in this function.
+    Vector3 x0 = { 0.0f };
+    float w0 = 0.0f;
+
+    // check all possible clusters for this total order
+    // i is the running index into s_fourElement; it advances once per (c0, c1, c2) triple.
+    // NOTE(review): presumably the table was generated with this same loop nesting - confirm.
+    for (uint c0 = 0, i = 0; c0 <= count; c0++)
+    {
+        // Running sum of the weighted colors in the second cluster.
+        Vector3 x1 = { 0.0f };
+        float w1 = 0.0f;
+
+        for (uint c1 = 0; c1 <= count - c0; c1++)
+        {
+            // Running sum of the weighted colors in the third cluster.
+            Vector3 x2 = { 0.0f };
+            float w2 = 0.0f;
+
+            for (uint c2 = 0; c2 <= count - c0 - c1; c2++, i++)
+            {
+                float const alpha2_sum = s_fourElement[i].alpha2_sum;
+                float const beta2_sum = s_fourElement[i].beta2_sum;
+                float const alphabeta_sum = s_fourElement[i].alphabeta_sum;
+                float const factor = s_fourElement[i].factor;
+
+                // Clusters count towards the start endpoint with weights 1, 2/3 and 1/3; the remainder with 0.
+                Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                Vector3 const betax_sum = m_xsum - alphax_sum;
+
+                // Least-squares endpoints for this split.
+                Vector3 a = (alphax_sum*beta2_sum - betax_sum * alphabeta_sum)*factor;
+                Vector3 b = (betax_sum*alpha2_sum - alphax_sum * alphabeta_sum)*factor;
+
+                // clamp to the grid
+                a = saturate(a);
+                b = saturate(b);
+#if ICBC_PERFECT_ROUND
+                a = round565(a);
+                b = round565(b);
+#else
+                a = round(grid * a) * gridrcp;
+                b = round(grid * b) * gridrcp;
+#endif
+                // @@ It would be much more accurate to evaluate the error exactly. 
+
+                // compute the error
+                Vector3 e1 = a * a*alpha2_sum + b * b*beta2_sum + 2.0f*(a*b*alphabeta_sum - a * alphax_sum - b * betax_sum);
+
+                // apply the metric to the error term
+                float error = dot(e1, m_metricSqr);
+
+                // keep the solution if it wins
+                if (error < besterror)
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                }
+
+                // Move the next ordered color into the third cluster.
+                // NOTE(review): on the last iteration c0 + c1 + c2 == count, so entry [count] is read and
+                // then discarded; confirm m_weighted / m_weights hold at least count + 1 entries.
+                x2 += m_weighted[c0 + c1 + c2];
+                w2 += m_weights[c0 + c1 + c2];
+            }
+
+            // Move the next ordered color into the second cluster (same index-[count] caveat).
+            x1 += m_weighted[c0 + c1];
+            w1 += m_weights[c0 + c1];
+        }
+
+        // Move the next ordered color into the first cluster (same index-[count] caveat).
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if (besterror < m_besterror)
+    {
+        *start = beststart;
+        *end = bestend;
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+#endif // ICBC_USE_SIMD
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Palette evaluation.
+
+// D3D10
+// Four-color mode, D3D10 rules: derives palette[2] and palette[3] from the endpoints already
+// stored in palette[0] and palette[1] as truncated 2:1 and 1:2 blends. c0 and c1 are not read.
+inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
+    const Color32 & first = palette[0];
+    const Color32 & second = palette[1];
+
+    // Entry 2 leans towards the first endpoint.
+    Color32 & near_first = palette[2];
+    near_first.r = (2 * first.r + second.r) / 3;
+    near_first.g = (2 * first.g + second.g) / 3;
+    near_first.b = (2 * first.b + second.b) / 3;
+    near_first.a = 0xFF;
+
+    // Entry 3 leans towards the second endpoint.
+    Color32 & near_second = palette[3];
+    near_second.r = (2 * second.r + first.r) / 3;
+    near_second.g = (2 * second.g + first.g) / 3;
+    near_second.b = (2 * second.b + first.b) / 3;
+    near_second.a = 0xFF;
+}
+// Three-color mode, D3D10 rules: palette[2] becomes the truncated average of the endpoints already
+// stored in palette[0] and palette[1]; palette[3] is cleared to all zero bits. c0 and c1 are not read.
+inline void evaluate_palette3_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
+    const Color32 & first = palette[0];
+    const Color32 & second = palette[1];
+
+    Color32 & middle = palette[2];
+    middle.r = (first.r + second.r) / 2;
+    middle.g = (first.g + second.g) / 2;
+    middle.b = (first.b + second.b) / 2;
+    middle.a = 0xFF;
+
+    palette[3].u = 0;
+}
+// Builds the full D3D10 palette for an endpoint pair: expands both endpoints to 8 bits per channel,
+// then fills the remaining entries in four-color mode when c0.u > c1.u and three-color mode otherwise.
+static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[0] = bitexpand_color16_to_color32(c0);
+    palette[1] = bitexpand_color16_to_color32(c1);
+
+    const bool four_color_mode = c0.u > c1.u;
+    if (!four_color_mode) {
+        evaluate_palette3_d3d10(c0, c1, palette);
+        return;
+    }
+
+    evaluate_palette4_d3d10(c0, c1, palette);
+}
+
+// NV
+// Four-color mode, NVIDIA rules: red and blue are interpolated from the raw 5-bit endpoint fields,
+// green from the expanded 8-bit values already stored in palette[0] and palette[1].
+inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
+    const int green_delta = palette[1].g - palette[0].g;
+
+    Color32 & near_first = palette[2];
+    near_first.r = ((2 * c0.r + c1.r) * 22) / 8;
+    near_first.g = (256 * palette[0].g + green_delta / 4 + 128 + green_delta * 80) / 256;
+    near_first.b = ((2 * c0.b + c1.b) * 22) / 8;
+    near_first.a = 0xFF;
+
+    Color32 & near_second = palette[3];
+    near_second.r = ((2 * c1.r + c0.r) * 22) / 8;
+    near_second.g = (256 * palette[1].g - green_delta / 4 + 128 - green_delta * 80) / 256;
+    near_second.b = ((2 * c1.b + c0.b) * 22) / 8;
+    near_second.a = 0xFF;
+}
+// Three-color mode, NVIDIA rules: the midpoint's red and blue come from the raw 5-bit endpoint
+// fields, its green from the expanded 8-bit values in palette[0] and palette[1]; palette[3] is
+// cleared to all zero bits.
+inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
+    const int green_delta = palette[1].g - palette[0].g;
+
+    Color32 & middle = palette[2];
+    middle.r = ((c0.r + c1.r) * 33) / 8;
+    middle.g = (256 * palette[0].g + green_delta / 4 + 128 + green_delta * 128) / 256;
+    middle.b = ((c0.b + c1.b) * 33) / 8;
+    middle.a = 0xFF;
+
+    palette[3].u = 0;
+}
+// Builds the full NVIDIA palette for an endpoint pair: expands both endpoints to 8 bits per channel,
+// then fills the remaining entries in four-color mode when c0.u > c1.u and three-color mode otherwise.
+static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[0] = bitexpand_color16_to_color32(c0);
+    palette[1] = bitexpand_color16_to_color32(c1);
+
+    const bool four_color_mode = c0.u > c1.u;
+    if (!four_color_mode) {
+        evaluate_palette3_nv(c0, c1, palette);
+        return;
+    }
+
+    evaluate_palette4_nv(c0, c1, palette);
+}
+
+// AMD
+// Four-color mode, AMD rules: palette[2] and palette[3] are 43:21 and 21:43 blends of the expanded
+// 8-bit endpoints already stored in palette[0] and palette[1], rounded to nearest.
+// c0 and c1 are not read.
+//
+// The blend weights sum to 64, so the weighted sum has to be divided by 64 (with +32 as the
+// rounding bias). Dividing by 8 produced values up to 2044 that were then truncated to 8 bits.
+inline void evaluate_palette4_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[2].r = (43 * palette[0].r + 21 * palette[1].r + 32) / 64;
+    palette[2].g = (43 * palette[0].g + 21 * palette[1].g + 32) / 64;
+    palette[2].b = (43 * palette[0].b + 21 * palette[1].b + 32) / 64;
+    palette[2].a = 0xFF;
+
+    palette[3].r = (43 * palette[1].r + 21 * palette[0].r + 32) / 64;
+    palette[3].g = (43 * palette[1].g + 21 * palette[0].g + 32) / 64;
+    palette[3].b = (43 * palette[1].b + 21 * palette[0].b + 32) / 64;
+    palette[3].a = 0xFF;
+}
+// Three-color mode, AMD rules: palette[2] is the rounded average of the expanded 8-bit endpoints
+// already stored in palette[0] and palette[1]; palette[3] is cleared to all zero bits.
+// c0 and c1 are not read.
+//
+// The average must be taken over the expanded endpoints. Averaging the raw Color16 fields yields
+// a 5/6-bit quantity, which made the midpoint far darker than either endpoint.
+inline void evaluate_palette3_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[2].r = (palette[0].r + palette[1].r + 1) / 2;
+    palette[2].g = (palette[0].g + palette[1].g + 1) / 2;
+    palette[2].b = (palette[0].b + palette[1].b + 1) / 2;
+    palette[2].a = 0xFF;
+    palette[3].u = 0;
+}
+// Builds the full AMD palette for an endpoint pair: expands both endpoints to 8 bits per channel,
+// then fills the remaining entries in four-color mode when c0.u > c1.u and three-color mode otherwise.
+static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[0] = bitexpand_color16_to_color32(c0);
+    palette[1] = bitexpand_color16_to_color32(c1);
+
+    const bool four_color_mode = c0.u > c1.u;
+    if (!four_color_mode) {
+        evaluate_palette3_amd(c0, c1, palette);
+        return;
+    }
+
+    evaluate_palette4_amd(c0, c1, palette);
+}
+
+// Use ICBC_DECODER to determine decoder used.
+inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) {
+#if ICBC_DECODER == Decoder_D3D10
+    evaluate_palette4_d3d10(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_NVIDIA
+    evaluate_palette4_nv(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_AMD
+    evaluate_palette4_amd(c0, c1, palette);
+#endif
+}
+inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) {
+#if ICBC_DECODER == Decoder_D3D10
+    evaluate_palette3_d3d10(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_NVIDIA
+    evaluate_palette3_nv(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_AMD
+    evaluate_palette3_amd(c0, c1, palette);
+#endif
+}
+inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) {
+#if ICBC_DECODER == Decoder_D3D10
+    evaluate_palette_d3d10(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_NVIDIA
+    evaluate_palette_nv(c0, c1, palette);
+#elif ICBC_DECODER == Decoder_AMD
+    evaluate_palette_amd(c0, c1, palette);
+#endif
+}
+
+// Floating-point variant: evaluates the 8-bit palette for the endpoint pair and converts each of
+// the four entries with color_to_vector3.
+static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
+    Color32 rgba8[4];
+    evaluate_palette(c0, c1, rgba8);
+
+    Vector3 * dst = palette;
+    for (const Color32 & entry : rgba8) {
+        *dst++ = color_to_vector3(entry);
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Error evaluation.
+
+// Different ways of estimating the error.
+
+// Squared length of the difference between p and c after scaling each channel by w and by 255.
+static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
+    const Vector3 delta = p - c;
+    const Vector3 weighted = delta * w;
+    const Vector3 scaled = weighted * 255;
+    return dot(scaled, scaled);
+}
+
+// Same metric as the Vector3 overload, with the palette entry p given as an 8-bit color that is
+// first converted with color_to_vector3.
+static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) {
+    const Vector3 palette_color = color_to_vector3(p);
+    const Vector3 scaled = (palette_color - c) * w * 255;
+    return dot(scaled, scaled);
+}
+
+
+/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
+    return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z);
+}*/
+
+// Sum of the squared red, green and blue differences between two 8-bit colors (alpha is ignored).
+static int evaluate_mse(const Color32 & p, const Color32 & c) {
+    const int dr = int(p.r) - c.r;
+    const int dg = int(p.g) - c.g;
+    const int db = int(p.b) - c.b;
+    return (square(dr) + square(dg) + square(db));
+}
+
+/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
+    float e0 = evaluate_mse(palette[0], c, w);
+    float e1 = evaluate_mse(palette[1], c, w);
+    float e2 = evaluate_mse(palette[2], c, w);
+    float e3 = evaluate_mse(palette[3], c, w);
+    return min(min(e0, e1), min(e2, e3));
+}*/
+
+// Squared RGB distance from c to whichever of the four palette entries is closest.
+static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
+    const int first_pair = min(evaluate_mse(palette[0], c), evaluate_mse(palette[1], c));
+    const int second_pair = min(evaluate_mse(palette[2], c), evaluate_mse(palette[3], c));
+    return min(first_pair, second_pair);
+}
+
+// Squared RGB distance between `color` and the palette entry `index` of the given block, with
+// channels expressed in the 0-255 range.
+static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
+    Color32 decoded[4];
+    evaluate_palette(output->col0, output->col1, decoded);
+
+    const Color32 & chosen = decoded[index];
+    return evaluate_mse(chosen, color);
+}
+
+// Weighted sum, over `count` colors, of the squared distance between each color and its closest
+// palette entry. Channels are expressed in the 0-255 range.
+static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
+    float weighted_error = 0.0f;
+
+    int i = 0;
+    while (i < count) {
+        weighted_error += weights[i] * evaluate_mse(palette, colors[i]);
+        ++i;
+    }
+
+    return weighted_error;
+}
+
+// Unweighted variant: sum, over `count` colors, of the squared distance between each color and its
+// closest palette entry.
+static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) {
+    float accumulated = 0.0f;
+
+    int i = 0;
+    while (i < count) {
+        accumulated += evaluate_mse(palette, colors[i]);
+        ++i;
+    }
+
+    return accumulated;
+}
+
+// Disabled (compiled out) variant that scores a block against 16 float colors.
+// NOTE(review): the two-argument evaluate_mse(Vector3, Vector3) call below has no matching overload
+// in the visible code, so this would need updating before it could be re-enabled.
+#if 0
+static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
+    Color32 palette[4];
+    output->evaluatePalette(palette, /*d3d9=*/false);
+
+    // convert palette to float.
+    Vector3 vector_palette[4];
+    for (int i = 0; i < 4; i++) {
+        vector_palette[i] = color_to_vector3(palette[i]);
+    }
+
+    // evaluate error for each index.
+    float error = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        int index = (output->indices >> (2*i)) & 3;
+        error += evaluate_mse(vector_palette[index], colors[i]);
+    }
+
+    return error;
+}
+#endif
+
+// Scores an encoded block against its 16 source texels: decodes the palette, looks up the entry
+// each 2-bit selector points at, and accumulates the per-texel weighted error.
+static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
+    Color32 decoded[4];
+    evaluate_palette(output->col0, output->col1, decoded);
+
+    float block_error = 0.0f;
+    for (int texel = 0; texel < 16; texel++) {
+        // Two selector bits per texel, lowest bits first.
+        const int selector = (output->indices >> (2 * texel)) & 3;
+        const float texel_error = evaluate_mse(decoded[selector], input_colors[texel].xyz, color_weights);
+        block_error += input_weights[texel] * texel_error;
+    }
+    return block_error;
+}
+
+float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) {
+    Color32 palette[4];
+    if (decoder == Decoder_NVIDIA) {
+        evaluate_palette_nv(block->col0, block->col1, palette);
+    }
+    else if (decoder == Decoder_AMD) {
+        evaluate_palette_amd(block->col0, block->col1, palette);
+    }
+    else {
+        evaluate_palette(block->col0, block->col1, palette);
+    }
+
+    // evaluate error for each index.
+    float error = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        int index = (block->indices >> (2 * i)) & 3;
+        Color32 c;
+        c.r = rgba_block[4 * i + 0];
+        c.g = rgba_block[4 * i + 1];
+        c.b = rgba_block[4 * i + 2];
+        c.a = 255;
+        error += evaluate_mse(palette[index], c);
+    }
+    return error;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Index selection
+
+// Picks a 2-bit palette selector for each of the 16 texels with a branch-free comparison network
+// over the four weighted distances, and packs the selectors lowest texel first.
+static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+
+    uint packed = 0;
+    for (int texel = 0; texel < 16; texel++) {
+        const float d0 = evaluate_mse(palette[0], input_colors[texel].xyz, color_weights);
+        const float d1 = evaluate_mse(palette[1], input_colors[texel].xyz, color_weights);
+        const float d2 = evaluate_mse(palette[2], input_colors[texel].xyz, color_weights);
+        const float d3 = evaluate_mse(palette[3], input_colors[texel].xyz, color_weights);
+
+        const uint d0_gt_d3 = d0 > d3;
+        const uint d1_gt_d2 = d1 > d2;
+        const uint d0_gt_d2 = d0 > d2;
+        const uint d1_gt_d3 = d1 > d3;
+        const uint d2_gt_d3 = d2 > d3;
+
+        const uint low_bit = d0_gt_d3 & d2_gt_d3;
+        const uint high_bit = (d1_gt_d2 & d0_gt_d2) | (d0_gt_d3 & d1_gt_d3);
+
+        packed |= (low_bit | (high_bit << 1)) << (2 * texel);
+    }
+
+    return packed;
+}
+
+
+// Unweighted variant: picks a 2-bit palette selector for each of the 16 texels with a branch-free
+// comparison network over the four distances, and packs the selectors lowest texel first.
+static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) {
+
+    uint packed = 0;
+    for (int texel = 0; texel < 16; texel++) {
+        const float d0 = evaluate_mse(palette[0], input_colors[texel], {1,1,1});
+        const float d1 = evaluate_mse(palette[1], input_colors[texel], {1,1,1});
+        const float d2 = evaluate_mse(palette[2], input_colors[texel], {1,1,1});
+        const float d3 = evaluate_mse(palette[3], input_colors[texel], {1,1,1});
+
+        const uint d0_gt_d3 = d0 > d3;
+        const uint d1_gt_d2 = d1 > d2;
+        const uint d0_gt_d2 = d0 > d2;
+        const uint d1_gt_d3 = d1 > d3;
+        const uint d2_gt_d3 = d2 > d3;
+
+        const uint low_bit = d0_gt_d3 & d2_gt_d3;
+        const uint high_bit = (d1_gt_d2 & d0_gt_d2) | (d0_gt_d3 & d1_gt_d3);
+
+        packed |= (low_bit | (high_bit << 1)) << (2 * texel);
+    }
+
+    return packed;
+}
+
+
+// Picks, for each of the 16 texels, the palette entry with the smallest weighted distance and packs
+// the 2-bit selectors lowest texel first. An entry wins only by being strictly closer than every
+// later entry, so ties resolve towards the higher index.
+static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+
+    uint packed = 0;
+    for (int texel = 0; texel < 16; texel++) {
+        float dist[4];
+        for (int entry = 0; entry < 4; entry++) {
+            dist[entry] = evaluate_mse(palette[entry], input_colors[texel].xyz, color_weights);
+        }
+
+        uint selector = 3;
+        if (dist[0] < dist[1] && dist[0] < dist[2] && dist[0] < dist[3]) {
+            selector = 0;
+        }
+        else if (dist[1] < dist[2] && dist[1] < dist[3]) {
+            selector = 1;
+        }
+        else if (dist[2] < dist[3]) {
+            selector = 2;
+        }
+
+        packed |= selector << (2 * texel);
+    }
+
+    return packed;
+}
+
+
+// Writes a block for the endpoint pair (v0, v1) with the endpoints ordered so that col0 <= col1,
+// the ordering for which the palette evaluators use three-color mode, then selects the closest
+// palette entry for every texel.
+static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+{
+    Color16 low = vector3_to_color16(v0);
+    Color16 high = vector3_to_color16(v1);
+    if (low.u > high.u) {
+        swap(low, high);
+    }
+
+    block->col0 = low;
+    block->col1 = high;
+
+    Vector3 decoded[4];
+    evaluate_palette(low, high, decoded);
+    block->indices = compute_indices(input_colors, color_weights, decoded);
+}
+
+// Writes a block for the endpoint pair (v0, v1) with the endpoints ordered so that col0 >= col1,
+// then selects indices with the four-color comparison network.
+static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+{
+    Color16 high = vector3_to_color16(v0);
+    Color16 low = vector3_to_color16(v1);
+    if (high.u < low.u) {
+        swap(high, low);
+    }
+
+    block->col0 = high;
+    block->col1 = low;
+
+    Vector3 decoded[4];
+    evaluate_palette(high, low, decoded);
+    block->indices = compute_indices4(input_colors, color_weights, decoded);
+}
+
+
+// Unweighted variant: writes a block for the endpoint pair (v0, v1) with the endpoints ordered so
+// that col0 >= col1, then selects indices with the four-color comparison network.
+static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+{
+    Color16 high = vector3_to_color16(v0);
+    Color16 low = vector3_to_color16(v1);
+    if (high.u < low.u) {
+        swap(high, low);
+    }
+
+    block->col0 = high;
+    block->col1 = low;
+
+    Vector3 decoded[4];
+    evaluate_palette(high, low, decoded);
+    block->indices = compute_indices4(input_colors, decoded);
+}
+
+// Least-squares fit of the two endpoints for a fixed set of four-color selectors.
+// Each texel's selector gives its blend weight towards endpoint b (0 -> 0, 1 -> 1, 2 -> 1/3,
+// 3 -> 2/3). Returns false, leaving *a and *b untouched, when the system is singular; otherwise
+// stores the saturated solution and returns true. @@ Take weights into account.
+static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
+{
+    float sum_aa = 0.0f;
+    float sum_bb = 0.0f;
+    float sum_ab = 0.0f;
+    Vector3 sum_ax = { 0,0,0 };
+    Vector3 sum_bx = { 0,0,0 };
+
+    for (int i = 0; i < count; i++)
+    {
+        const uint selector = indices >> (2 * i);
+
+        float wb = float(selector & 1);
+        if (selector & 2) wb = (1 + wb) / 3.0f;
+        const float wa = 1.0f - wb;
+
+        sum_aa += wa * wa;
+        sum_bb += wb * wb;
+        sum_ab += wa * wb;
+        sum_ax += wa * colors[i].xyz;
+        sum_bx += wb * colors[i].xyz;
+    }
+
+    const float det = sum_aa * sum_bb - sum_ab * sum_ab;
+    if (equal(det, 0.0f)) return false;
+
+    const float inv_det = 1.0f / det;
+
+    *a = saturate((sum_ax * sum_bb - sum_bx * sum_ab) * inv_det);
+    *b = saturate((sum_bx * sum_aa - sum_ax * sum_ab) * inv_det);
+
+    return true;
+}
+
+// Least-squares endpoint fit with caller-supplied blend factors.
+// factors[s] is the weight towards endpoint a for selector s, so callers can pass the standard
+// [1, 0, 2/3, 1/3] mapping or an alternative one when the number of clusters is not 4.
+// Returns false, leaving *a and *b untouched, when the system is singular.
+static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b)
+{
+    float sum_aa = 0.0f;
+    float sum_bb = 0.0f;
+    float sum_ab = 0.0f;
+    Vector3 sum_ax = { 0,0,0 };
+    Vector3 sum_bx = { 0,0,0 };
+
+    for (int i = 0; i < count; i++)
+    {
+        const uint selector = (indices >> (2 * i)) & 3;
+        const float wa = factors[selector];
+        const float wb = 1 - wa;
+
+        sum_aa += wa * wa;
+        sum_bb += wb * wb;
+        sum_ab += wa * wb;
+        sum_ax += wa * colors[i];
+        sum_bx += wb * colors[i];
+    }
+
+    const float det = sum_aa * sum_bb - sum_ab * sum_ab;
+    if (equal(det, 0.0f)) return false;
+
+    const float inv_det = 1.0f / det;
+
+    *a = saturate((sum_ax * sum_bb - sum_bx * sum_ab) * inv_det);
+    *b = saturate((sum_bx * sum_aa - sum_ax * sum_ab) * inv_det);
+
+    return true;
+}
+
+// Convenience overload that runs the factor-based fit with the standard four-color mapping.
+static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b)
+{
+    float standard_factors[4] = { 1, 0, 2.f / 3, 1.f / 3 };
+    const bool solved = optimize_end_points4(indices, colors, count, standard_factors, a, b);
+    return solved;
+}
+
+
+// Least-squares fit of the two endpoints for a fixed set of three-color selectors.
+// Selector 0 maps fully to endpoint a, selector 1 fully to endpoint b, and any selector with bit 1
+// set is treated as the midpoint. @@ This does not support black/transparent index.
+// @@ Take weights into account.
+// Returns false, leaving *a and *b untouched, when the system is singular.
+static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
+{
+    float sum_aa = 0.0f;
+    float sum_bb = 0.0f;
+    float sum_ab = 0.0f;
+    Vector3 sum_ax = { 0,0,0 };
+    Vector3 sum_bx = { 0,0,0 };
+
+    for (int i = 0; i < count; i++)
+    {
+        const uint selector = indices >> (2 * i);
+
+        float wb = float(selector & 1);
+        if (selector & 2) wb = 0.5f;
+        const float wa = 1.0f - wb;
+
+        sum_aa += wa * wa;
+        sum_bb += wb * wb;
+        sum_ab += wa * wb;
+        sum_ax += wa * colors[i];
+        sum_bx += wb * colors[i];
+    }
+
+    const float det = sum_aa * sum_bb - sum_ab * sum_ab;
+    if (equal(det, 0.0f)) return false;
+
+    const float inv_det = 1.0f / det;
+
+    *a = saturate((sum_ax * sum_bb - sum_bx * sum_ab) * inv_det);
+    *b = saturate((sum_bx * sum_aa - sum_ax * sum_ab) * inv_det);
+
+    return true;
+}
+
+
+// Computes the per-channel bounding box of the colors: *c0 receives the maximum corner (seeded
+// with 0,0,0) and *c1 the minimum corner (seeded with 1,1,1).
+inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
+{
+    *c0 = { 0,0,0 };
+    *c1 = { 1,1,1 };
+
+    int i = 0;
+    while (i < count) {
+        *c0 = max(*c0, colors[i]);
+        *c1 = min(*c1, colors[i]);
+        ++i;
+    }
+}
+
+// Chooses which diagonal of the bounding box (*c0, *c1) to use. Measures how x and y co-vary with
+// z around the box center and, for each of x and y with negative covariance, exchanges that
+// channel between the two corners. The z channel of both corners is left as is.
+inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
+{
+    const Vector3 box_center = (*c0 + *c1) * 0.5f;
+
+    float xz = 0.0f;
+    float yz = 0.0f;
+    for (int i = 0; i < count; i++) {
+        const Vector3 offset = colors[i] - box_center;
+        xz += offset.x * offset.z;
+        yz += offset.y * offset.z;
+    }
+
+    const bool flip_x = xz < 0;
+    const bool flip_y = yz < 0;
+
+    const float x0 = flip_x ? c1->x : c0->x;
+    const float x1 = flip_x ? c0->x : c1->x;
+    const float y0 = flip_y ? c1->y : c0->y;
+    const float y1 = flip_y ? c0->y : c1->y;
+
+    *c0 = { x0, y0, c0->z };
+    *c1 = { x1, y1, c1->z };
+}
+
+// Pulls both corners of the box towards each other by 1/16 of its extent minus a small bias, and
+// saturates the results.
+inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1)
+{
+    const float bias = (8.0f / 255.0f) / 16.0f;
+    const Vector3 extent = *c0 - *c1;
+    const Vector3 shrink = extent / 16.0f - scalar_to_vector3(bias);
+
+    *c0 = saturate(*c0 - shrink);
+    *c1 = saturate(*c1 + shrink);
+}
+
+
+
+// Single color lookup tables from:
+// https://github.com/nothings/stb/blob/master/stb_dxt.h
+// Indexed by an 8-bit channel value. Entry [0] is stored into col0 and entry [1] into col1 by
+// compress_dxt1_single_color_optimal. match5 serves the red and blue channels, match6 serves green.
+// Both tables are filled by init_dxt1_tables().
+static uint8 match5[256][2];
+static uint8 match6[256][2];
+
+// Integer blend of two values with a weighted 2:1 (truncating division by 3).
+static inline int Lerp13(int a, int b)
+{
+    // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
+    const int weighted_sum = 2 * a + b;
+    return weighted_sum / 3;
+}
+
+// Fills a single-color lookup table: for each 8-bit target value, searches every (min, max) pair of
+// quantized endpoints and records the pair whose 2:1 blend lands closest to the target.
+//   table:  256 entries of 2 bytes each; [0] receives the max endpoint, [1] the min endpoint.
+//   expand: maps a quantized endpoint to its 8-bit value.
+//   size:   number of quantized endpoint values.
+static void PrepareOptTable(uint8 * table, const uint8 * expand, int size)
+{
+    for (int target = 0; target < 256; target++) {
+        uint8 * entry = table + target * 2;
+        int lowest = 256 * 100;
+
+        for (int lo = 0; lo < size; lo++) {
+            const int lo_expanded = expand[lo];
+
+            for (int hi = 0; hi < size; hi++) {
+                const int hi_expanded = expand[hi];
+
+                const int blend_error = abs(Lerp13(hi_expanded, lo_expanded) - target) * 100;
+
+                // DX10 spec says that interpolation must be within 3% of "correct" result,
+                // add this as error term. (normally we'd expect a random distribution of
+                // +-1.5% error, but nowhere in the spec does it say that the error has to be
+                // unbiased - better safe than sorry).
+                const int spread_penalty = abs(hi - lo) * 3;
+
+                const int err = blend_error + spread_penalty;
+                if (err >= lowest) {
+                    continue;
+                }
+
+                lowest = err;
+                entry[0] = hi;
+                entry[1] = lo;
+            }
+        }
+    }
+}
+
+// Builds the single-color lookup tables match5 and match6 from the 5-bit and 6-bit
+// bit-replication expansions.
+static void init_dxt1_tables()
+{
+    // 5-bit channels: replicate the top bits into the low bits.
+    uint8 expand5[32];
+    for (int v = 0; v < 32; v++) {
+        expand5[v] = (v << 3) | (v >> 2);
+    }
+    PrepareOptTable(&match5[0][0], expand5, 32);
+
+    // 6-bit channel.
+    uint8 expand6[64];
+    for (int v = 0; v < 64; v++) {
+        expand6[v] = (v << 2) | (v >> 4);
+    }
+    PrepareOptTable(&match6[0][0], expand6, 64);
+}
+
+// Single color compressor, based on:
+// https://mollyrocket.com/forums/viewtopic.php?t=392
+// Looks up both endpoints per channel in match5 / match6 and points every texel at palette entry 2.
+// If the looked-up endpoints come out with col0 < col1 they are exchanged and every selector is
+// switched to entry 3.
+static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
+{
+    const uint8 * red = match5[c.r];
+    const uint8 * green = match6[c.g];
+    const uint8 * blue = match5[c.b];
+
+    output->col0.r = red[0];
+    output->col0.g = green[0];
+    output->col0.b = blue[0];
+    output->col1.r = red[1];
+    output->col1.g = green[1];
+    output->col1.b = blue[1];
+    output->indices = 0xaaaaaaaa;
+
+    if (output->col0.u >= output->col1.u) {
+        return;
+    }
+
+    swap(output->col0.u, output->col1.u);
+    output->indices ^= 0x55555555;
+}
+
+
+// Compresses the block as a single color: encodes the weighted average of the inputs with the
+// single-color tables, then returns the weighted error of that one decoded color against every
+// input color.
+static float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output)
+{
+    // Weighted average of the block.
+    Vector3 weighted_sum = { 0,0,0 };
+    float total_weight = 0;
+    for (int i = 0; i < count; i++) {
+        weighted_sum += colors[i] * weights[i];
+        total_weight += weights[i];
+    }
+
+    const Color32 average = vector3_to_color32(weighted_sum / total_weight);
+    compress_dxt1_single_color_optimal(average, output);
+
+    // Every texel shares one selector, so decoding texel 0 is enough.
+    Color32 decoded[4];
+    evaluate_palette(output->col0, output->col1, decoded);
+    const Vector3 encoded_color = color_to_vector3(decoded[output->indices & 0x3]);
+
+    float total_error = 0;
+    for (int i = 0; i < count; i++) {
+        total_error += weights[i] * evaluate_mse(encoded_color, colors[i], color_weights);
+    }
+    return total_error;
+}
+
+
+// Exhaustive endpoint search inside an expanded 5:6:5 bounding box of the input colors.
+//   input_colors:     the 16 block texels, used for the final index selection.
+//   colors / weights: the `count` colors (and their weights) that drive the search.
+//   color_weights:    per-channel weights, used only for the final index selection.
+//   three_color_mode: when false, endpoint pairs that would decode in three-color mode are skipped.
+//   max_volume:       the search is abandoned when the box holds more than this many 5:6:5 points.
+// Returns FLT_MAX without touching *output when the box is too large; otherwise writes the best
+// block found and returns its error divided by 255 * 255.
+static float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output)
+{
+    // Compute bounding box.
+    Vector3 min_color = { 1,1,1 };
+    Vector3 max_color = { 0,0,0 };
+
+    for (int i = 0; i < count; i++) {
+        min_color = min(min_color, colors[i]);
+        max_color = max(max_color, colors[i]);
+    }
+
+    // Convert to 5:6:5
+    // The minimum is truncated and the maximum gets +1 before truncation, so the box is widened.
+    int min_r = int(31 * min_color.x);
+    int min_g = int(63 * min_color.y);
+    int min_b = int(31 * min_color.z);
+    int max_r = int(31 * max_color.x + 1);
+    int max_g = int(63 * max_color.y + 1);
+    int max_b = int(31 * max_color.z + 1);
+
+    // Expand the box.
+    // Each side grows by half the channel range plus 2, clamped to the valid 5:6:5 range.
+    int range_r = max_r - min_r;
+    int range_g = max_g - min_g;
+    int range_b = max_b - min_b;
+
+    min_r = max(0, min_r - range_r / 2 - 2);
+    min_g = max(0, min_g - range_g / 2 - 2);
+    min_b = max(0, min_b - range_b / 2 - 2);
+
+    max_r = min(31, max_r + range_r / 2 + 2);
+    max_g = min(63, max_g + range_g / 2 + 2);
+    max_b = min(31, max_b + range_b / 2 + 2);
+
+    // Estimate size of search space.
+    int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1);
+
+    // Give up when the box is larger than max_volume. Both endpoints range over the whole box, so
+    // the search below visits volume * volume pairs; volume is the square root of that count.
+    if (volume > max_volume) {
+        return FLT_MAX;
+    }
+
+    // @@ Convert to fixed point before building box?
+    Color32 colors32[16];
+    for (int i = 0; i < count; i++) {
+        colors32[i] = vector3_to_color32(colors[i]);
+    }
+
+    float best_error = FLT_MAX;
+    // NOTE(review): best0 / best1 are only assigned inside the search loop; they stay uninitialized
+    // if no candidate ever beats FLT_MAX.
+    Color16 best0, best1;           // @@ Record endpoints as Color16?
+
+    Color16 c0, c1;
+    Color32 palette[4];
+
+    // Outer three loops enumerate the first endpoint, inner three loops the second.
+    for(int r0 = min_r; r0 <= max_r; r0++)
+    for(int g0 = min_g; g0 <= max_g; g0++)
+    for(int b0 = min_b; b0 <= max_b; b0++)
+    {
+        c0.r = r0; c0.g = g0; c0.b = b0;
+        palette[0] = bitexpand_color16_to_color32(c0);
+
+        for(int r1 = min_r; r1 <= max_r; r1++)
+        for(int g1 = min_g; g1 <= max_g; g1++)
+        for(int b1 = min_b; b1 <= max_b; b1++)
+        {
+            c1.r = r1; c1.g = g1; c1.b = b1;
+            palette[1] = bitexpand_color16_to_color32(c1);
+
+            if (c0.u > c1.u) {
+                // Evaluate error in 4 color mode.
+                evaluate_palette4(c0, c1, palette);
+            }
+            else {
+                if (three_color_mode) {
+                    // Evaluate error in 3 color mode.
+                    evaluate_palette3(c0, c1, palette);
+                }
+                else {
+                    // Skip 3 color mode.
+                    continue;
+                }
+            }
+
+            float error = evaluate_palette_error(palette, colors32, weights, count);
+
+            if (error < best_error) {
+                best_error = error;
+                best0 = c0;
+                best1 = c1;
+            }
+        }
+    }
+
+    output->col0 = best0;
+    output->col1 = best1;
+
+    // Re-evaluate the winning palette in float and pick the per-texel indices from it.
+    Vector3 vector_palette[4];
+    evaluate_palette(output->col0, output->col1, vector_palette);
+
+    output->indices = compute_indices(input_colors, color_weights, vector_palette);
+
+    // The search error was accumulated on 0-255 channel values; rescale it to squared 0-1 units.
+    return best_error / (255 * 255);
+}
+
+
+// Encodes a block with the ClusterFit solver and writes the result to 'output'.
+//
+// input_colors is the full 16 pixel block; colors/weights/count describe the color set handed to
+// the regular solver. When ICBC_FAST_CLUSTER_FIT is enabled and count > 15, the fast solver is
+// fed input_colors directly instead.
+//
+// The 4 color solve always runs first. The 3 color solve only runs when three_color_mode is set,
+// and the block is written in 3 color form only when that solve returns true.
+static void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output)
+{
+    ClusterFit fit;
+
+    // Solver results; start & end are in [0, 1] range.
+    Vector3 start, end;
+    bool write_three_color_block;
+
+#if ICBC_FAST_CLUSTER_FIT
+    if (count > 15) {
+        fit.setColorSet(input_colors, color_weights);
+        fit.fastCompress4(&start, &end);
+        write_three_color_block = three_color_mode && fit.fastCompress3(&start, &end);
+    }
+    else
+#endif
+    {
+        fit.setColorSet(colors, weights, count, color_weights);
+        fit.compress4(&start, &end);
+        write_three_color_block = three_color_mode && fit.compress3(&start, &end);
+    }
+
+    if (write_three_color_block) {
+        output_block3(input_colors, color_weights, start, end, output);
+    }
+    else {
+        output_block4(input_colors, color_weights, start, end, output);
+    }
+}
+
+
+// Refines the endpoints of an already encoded block with a greedy local search.
+//
+// Each step copies *output, nudges one endpoint by one entry of the 'deltas' table, recomputes the
+// indices for the nudged endpoints and keeps the candidate when evaluate_mse reports a lower error.
+// The endpoint being nudged alternates every 16 steps (col1 for steps 0-15, col0 for 16-31, ...).
+//
+// input_error is the error of *output on entry. The return value is the error of the block left
+// in *output, which is input_error when no candidate improved on it.
+static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) {
+    // TODO:
+    // - Optimize palette evaluation when updating only one channel.
+    // - try all diagonals.
+
+    // Things that don't help:
+    // - Alternate endpoint updates.
+    // - Randomize order.
+    // - If one direction does not improve, test opposite direction next.
+
+    // Per-step (r, g, b) offsets applied to the selected endpoint.
+    static const int8 deltas[16][3] = {
+        {1,0,0},
+        {0,1,0},
+        {0,0,1},
+
+        {-1,0,0},
+        {0,-1,0},
+        {0,0,-1},
+
+        {1,1,0},
+        {1,0,1},
+        {0,1,1},
+
+        {-1,-1,0},
+        {-1,0,-1},
+        {0,-1,-1},
+
+        {-1,1,0},
+        //{-1,0,1},
+
+        {1,-1,0},
+        {0,-1,1},
+
+        //{1,0,-1},
+        {0,1,-1},
+    };
+
+    float best_error = input_error;
+
+    int lastImprovement = 0;
+    for (int i = 0; i < 256; i++) {
+        BlockDXT1 refined = *output;
+        const int8 * delta = deltas[i % 16];
+
+        // NOTE(review): the component additions below are not range-checked here; confirm what
+        // happens when an endpoint component is already at its minimum or maximum value.
+        if ((i / 16) & 1) {
+            refined.col0.r += delta[0];
+            refined.col0.g += delta[1];
+            refined.col0.b += delta[2];
+        }
+        else {
+            refined.col1.r += delta[0];
+            refined.col1.g += delta[1];
+            refined.col1.b += delta[2];
+        }
+
+        if (!three_color_mode) {
+            // Keep col0 > col1 so the candidate stays a 4 color block.
+            if (refined.col0.u == refined.col1.u) refined.col1.g += 1;
+            if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u);
+        }
+
+        // The palette must come from the refined endpoints: the indices stored in the candidate
+        // have to match the endpoints that evaluate_mse is going to score.
+        Vector3 palette[4];
+        evaluate_palette(refined.col0, refined.col1, palette);
+
+        refined.indices = compute_indices(input_colors, color_weights, palette);
+
+        float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
+        if (refined_error < best_error) {
+            best_error = refined_error;
+            *output = refined;
+            lastImprovement = i;
+        }
+
+        // Early out if the last 32 steps didn't improve error.
+        if (i - lastImprovement > 32) break;
+    }
+
+    return best_error;
+}
+
+
+static float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output)
+{
+    Vector3 colors[16];
+    float weights[16];
+    int count = reduce_colors(input_colors, input_weights, colors, weights);
+
+    if (count == 0) {
+        // Output trivial block.
+        output->col0.u = 0;
+        output->col1.u = 0;
+        output->indices = 0;
+        return 0;
+    }
+
+    // Cluster fit cannot handle single color blocks, so encode them optimally.
+    if (count == 1) {
+        compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output);
+        return evaluate_mse(input_colors, input_weights, color_weights, output);
+    }
+
+    // Quick end point selection.
+    Vector3 c0, c1;
+    fit_colors_bbox(colors, count, &c0, &c1);
+    inset_bbox(&c0, &c1);
+    select_diagonal(colors, count, &c0, &c1);
+    output_block4(input_colors, color_weights, c0, c1, output);
+
+    float error = evaluate_mse(input_colors, input_weights, color_weights, output);
+
+    // Refine color for the selected indices.
+    if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
+        BlockDXT1 optimized_block;
+        output_block4(input_colors, color_weights, c0, c1, &optimized_block);
+
+        float optimized_error = evaluate_mse(input_colors, input_weights, color_weights, &optimized_block);
+        if (optimized_error < error) {
+            error = optimized_error;
+            *output = optimized_block;
+        }
+    }
+
+    // @@ Use current endpoints as input for initial PCA approximation?
+
+    // Try cluster fit.
+    BlockDXT1 cluster_fit_output;
+    compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output);
+
+    float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output);
+    if (cluster_fit_error < error) {
+        *output = cluster_fit_output;
+        error = cluster_fit_error;
+    }
+
+    if (hq) {
+        error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output);
+    }
+
+    return error;
+}
+
+
+// 
+// Computes both endpoints as weighted centroids of the 16 block colors.
+//
+// 'indices' holds one 2 bit palette index per pixel (pixel i in bits 2*i and 2*i+1). For each
+// pixel, factor[index] is its weight towards c0 and (1 - factor[index]) its weight towards c1.
+//
+// Returns true and writes *c0 / *c1 on success. Returns false, leaving *c0 / *c1 untouched, when
+// either weight sum is zero (for example every pixel using a factor of 1), because the centroid
+// is undefined in that case and the division would produce NaN or infinity.
+static bool centroid_end_points(uint indices, const Vector3 * colors, /*const float * weights,*/ float factor[4], Vector3 * c0, Vector3 * c1) {
+
+    Vector3 sum0 = { 0,0,0 };
+    Vector3 sum1 = { 0,0,0 };
+    float w0_sum = 0;
+    float w1_sum = 0;
+
+    for (int i = 0; i < 16; i++) {
+        int idx = (indices >> (2 * i)) & 3;
+        float w0 = factor[idx];// * weights[i];
+        float w1 = (1 - factor[idx]);// * weights[i];
+
+        sum0 += colors[i] * w0;   w0_sum += w0;
+        sum1 += colors[i] * w1;   w1_sum += w1;
+    }
+
+    // Degenerate assignment: one of the endpoints received no weight at all.
+    if (w0_sum == 0 || w1_sum == 0) {
+        return false;
+    }
+
+    *c0 = sum0 * (1.0f / w0_sum);
+    *c1 = sum1 * (1.0f / w1_sum);
+
+    return true;
+}
+
+
+
+// Experimental compressor: bounding box fit followed by iterated endpoint re-optimization.
+//
+// After the initial 4 color block is written, each pass counts how many of the 4 palette entries
+// are actually used by the current indices (n) and re-optimizes the endpoints under every
+// assignment of interpolation factors that is compatible with n used entries. A candidate block
+// replaces *output only when evaluate_mse reports a strictly lower error.
+//
+// Passes repeat while they keep lowering the error and changing the indices. Because the error
+// strictly decreases on every continued pass, the loop terminates.
+//
+// Returns the error of the block left in *output.
+static float compress_dxt1_test(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
+{
+    Vector3 colors[16];
+    for (int i = 0; i < 16; i++) {
+        colors[i] = input_colors[i].xyz;
+    }
+    int count = 16;
+
+    // Quick end point selection.
+    Vector3 c0, c1;
+    fit_colors_bbox(colors, count, &c0, &c1);
+    if (c0 == c1) {
+        compress_dxt1_single_color_optimal(vector3_to_color32(c0), output);
+        return evaluate_mse(input_colors, input_weights, color_weights, output);
+    }
+    inset_bbox(&c0, &c1);
+    select_diagonal(colors, count, &c0, &c1);
+
+    output_block4(colors, c0, c1, output);
+    float best_error = evaluate_mse(input_colors, input_weights, color_weights, output);
+
+
+    // Given an index assignment, we can compute end points in two different ways:
+    // - least squares optimization.
+    // - centroid.
+    // Are these different? The first finds the end points that minimize the least squares error.
+    // The second averages the input colors
+
+    while (true) {
+        float last_error = best_error;
+        uint last_indices = output->indices;
+
+        // Count how many pixels use each of the 4 palette entries.
+        int cluster_counts[4] = { 0, 0, 0, 0 };
+        for (int i = 0; i < 16; i++) {
+            int idx = (output->indices >> (2 * i)) & 3;
+            cluster_counts[idx] += 1;
+        }
+        int n = 0;
+        for (int i = 0; i < 4; i++) n += int(cluster_counts[i] != 0);
+
+        if (n == 4) {
+            float factors[4] = { 1.0f, 0.0f, 2.0f / 3, 1.0f / 3 };
+            if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) {
+                BlockDXT1 refined_block;
+                output_block4(colors, c0, c1, &refined_block);
+                float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block);
+                if (new_error < best_error) {
+                    best_error = new_error;
+                    *output = refined_block;
+                }
+            }
+        }
+        else if (n == 3) {
+            // 4 options:
+            static const float tables[4][3] = {
+                { 0, 2.f/3, 1.f/3 },    // 0, 1/3, 2/3
+                { 1, 0,     1.f/3 },    // 0, 1/3, 1
+                { 1, 0,     2.f/3 },    // 0, 2/3, 1
+                { 1, 2.f/3, 1.f/3 },    // 1/3, 2/3, 1
+            };
+
+            for (int k = 0; k < 4; k++) {
+                // Remap tables: used palette entries consume the table row in order.
+                float factors[4];
+                for (int i = 0, j = 0; i < 4; i++) {
+                    // Once all 3 row entries are consumed, any trailing unused palette entry
+                    // reuses the last one instead of reading past the end of the row.
+                    factors[i] = tables[k][j < 3 ? j : 2];
+                    if (cluster_counts[i] != 0) j += 1;
+                }
+                if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) {
+                    BlockDXT1 refined_block;
+                    output_block4(colors, c0, c1, &refined_block);
+                    float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block);
+                    if (new_error < best_error) {
+                        best_error = new_error;
+                        *output = refined_block;
+                    }
+                }
+            }
+
+            // @@ And 1 3-color block:
+            // 0, 1/2, 1
+        }
+        else if (n == 2) {
+
+            // 6 options:
+            static const float tables[6][2] = {
+                { 0, 1.f/3 },       // 0, 1/3
+                { 0, 2.f/3 },       // 0, 2/3
+                { 1, 0 },           // 0, 1
+                { 2.f/3, 1.f/3 },   // 1/3, 2/3
+                { 1, 1.f/3 },       // 1/3, 1
+                { 1, 2.f/3 },       // 2/3, 1
+            };
+
+            for (int k = 0; k < 6; k++) {
+                // Remap tables: used palette entries consume the table row in order.
+                float factors[4];
+                for (int i = 0, j = 0; i < 4; i++) {
+                    // Once both row entries are consumed, any trailing unused palette entry
+                    // reuses the last one instead of reading past the end of the row.
+                    factors[i] = tables[k][j < 2 ? j : 1];
+                    if (cluster_counts[i] != 0) j += 1;
+                }
+                if (optimize_end_points4(last_indices, colors, 16, factors, &c0, &c1)) {
+                    BlockDXT1 refined_block;
+                    output_block4(colors, c0, c1, &refined_block);
+                    float new_error = evaluate_mse(input_colors, input_weights, color_weights, &refined_block);
+                    if (new_error < best_error) {
+                        best_error = new_error;
+                        *output = refined_block;
+                    }
+                }
+            }
+
+            // @@ And 2 3-color blocks:
+            // 0, 0.5
+            // 0.5, 1
+            // 0, 1     // This is equivalent to the 4 color mode.
+        }
+
+        // If error has not improved or indices haven't changed, stop.
+        // best_error never increases, so ">=" means "this pass found nothing better".
+        if (output->indices == last_indices || best_error >= last_error) break;
+    }
+
+    // Disabled experiment: endpoint search on top of the iterated result.
+    if (false) {
+        best_error = refine_endpoints(input_colors, input_weights, color_weights, false, best_error, output);
+    }
+
+    return best_error;
+}
+
+
+
+// Fast path for float input: bounding box endpoints plus one endpoint optimization pass.
+//
+// Only the xyz part of each input color is used for fitting. A block whose bounding box
+// collapses to a single point is encoded with compress_dxt1_single_color_optimal instead.
+// Returns the evaluate_mse error of the block written to 'output'.
+static float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
+{
+    const int count = 16;
+
+    Vector3 colors[16];
+    for (int i = 0; i < count; i++) colors[i] = input_colors[i].xyz;
+
+    /*float error = FLT_MAX;
+    error = compress_dxt1_single_color(colors, input_weights, count, color_weights, output);
+
+    if (error == 0.0f || count == 1) {
+        // Early out.
+        return error;
+    }*/
+
+    // Quick end point selection.
+    Vector3 end0, end1;
+    fit_colors_bbox(colors, count, &end0, &end1);
+
+    const bool single_color = (end0 == end1);
+    if (single_color) {
+        compress_dxt1_single_color_optimal(vector3_to_color32(end0), output);
+    }
+    else {
+        inset_bbox(&end0, &end1);
+        select_diagonal(colors, count, &end0, &end1);
+        output_block4(input_colors, color_weights, end0, end1, output);
+
+        // Refine color for the selected indices, then re-encode with the refined endpoints.
+        if (optimize_end_points4(output->indices, input_colors, 16, &end0, &end1)) {
+            output_block4(input_colors, color_weights, end0, end1, output);
+        }
+    }
+
+    return evaluate_mse(input_colors, input_weights, color_weights, output);
+}
+
+
+// Fast path for 8 bit input: 16 pixels, 4 bytes per pixel, of which the first three are used.
+//
+// The bytes are scaled by 1/255 into Vector3 colors, then the same bounding box fit and single
+// endpoint optimization pass as the float fast path is applied. No error is returned.
+static void compress_dxt1_fast(const uint8 input_colors[16*4], BlockDXT1 * output) {
+
+    Vector3 vec_colors[16];
+    for (int i = 0; i < 16; i++) {
+        const uint8 * pixel = input_colors + 4 * i;
+        vec_colors[i] = { pixel[0] / 255.0f, pixel[1] / 255.0f, pixel[2] / 255.0f };
+    }
+
+    // Quick end point selection.
+    Vector3 end0, end1;
+    //fit_colors_bbox(colors, count, &c0, &c1);
+    //select_diagonal(colors, count, &c0, &c1);
+    fit_colors_bbox(vec_colors, 16, &end0, &end1);
+
+    // Degenerate bounding box: encode as a single color and stop.
+    if (end0 == end1) {
+        compress_dxt1_single_color_optimal(vector3_to_color32(end0), output);
+        return;
+    }
+
+    inset_bbox(&end0, &end1);
+    select_diagonal(vec_colors, 16, &end0, &end1);
+    output_block4(vec_colors, end0, end1, output);
+
+    // Refine color for the selected indices.
+    const bool refined = optimize_end_points4(output->indices, vec_colors, 16, &end0, &end1);
+    if (refined) {
+        output_block4(vec_colors, end0, end1, output);
+    }
+}
+
+// Public API
+
+// Runs init_dxt1_tables().
+void init() {
+    init_dxt1_tables();
+}
+
+// Public entry point: reinterprets the flat float array as 16 Vector4 colors, rgb as the Vector3
+// channel weights and 'output' as a BlockDXT1, then forwards to the internal compressor.
+float compress_dxt1(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], bool three_color_mode, bool hq, void * output) {
+    const Vector4 * colors = (const Vector4 *)input_colors;
+    const Vector3 color_weights = { rgb[0], rgb[1], rgb[2] };
+    BlockDXT1 * block = (BlockDXT1 *)output;
+    return compress_dxt1(colors, input_weights, color_weights, three_color_mode, hq, block);
+}
+
+// Public entry point for the fast float compressor; same argument reinterpretation as compress_dxt1.
+float compress_dxt1_fast(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], void * output) {
+    const Vector4 * colors = (const Vector4 *)input_colors;
+    const Vector3 color_weights = { rgb[0], rgb[1], rgb[2] };
+    BlockDXT1 * block = (BlockDXT1 *)output;
+    return compress_dxt1_fast(colors, input_weights, color_weights, block);
+}
+
+// Public entry point for the fast 8 bit compressor; 'output' is reinterpreted as a BlockDXT1.
+void compress_dxt1_fast(const unsigned char input_colors[16 * 4], void * output) {
+    BlockDXT1 * block = (BlockDXT1 *)output;
+    compress_dxt1_fast(input_colors, block);
+}
+
+// Public entry point for the experimental compressor. The error computed internally is discarded.
+void compress_dxt1_test(const float input_colors[16 * 4], const float input_weights[16], const float rgb[3], void * output) {
+    const Vector4 * colors = (const Vector4 *)input_colors;
+    const Vector3 color_weights = { rgb[0], rgb[1], rgb[2] };
+    BlockDXT1 * block = (BlockDXT1 *)output;
+    compress_dxt1_test(colors, input_weights, color_weights, block);
+}
+
+// Public entry point: reinterprets dxt_block as a BlockDXT1 and forwards to the internal evaluator.
+float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) {
+    BlockDXT1 * block = (BlockDXT1 *)dxt_block;
+    return evaluate_dxt1_error(rgba_block, block, decoder);
+}
+
+} // icbc
+#endif // ICBC_IMPLEMENTATION
diff --git a/src/nvtt/tests/CMakeLists.txt b/src/nvtt/tests/CMakeLists.txt
index 0cf1ab4..872fd8f 100644
--- a/src/nvtt/tests/CMakeLists.txt
+++ b/src/nvtt/tests/CMakeLists.txt
@@ -29,7 +29,7 @@ ADD_EXECUTABLE(nvhdrtest hdrtest.cpp)
 TARGET_LINK_LIBRARIES(nvhdrtest nvcore nvimage nvtt bc6h nvmath)
 
 ADD_EXECUTABLE(bc1enc bc1enc.cpp)
-TARGET_LINK_LIBRARIES(bc1enc nvcore nvimage nvmath nvtt squish CMP_Core)
+TARGET_LINK_LIBRARIES(bc1enc nvcore nvimage nvmath squish CMP_Core)
 
 INSTALL(TARGETS nvtestsuite nvhdrtest DESTINATION bin)
  
diff --git a/src/nvtt/tests/bc1enc.cpp b/src/nvtt/tests/bc1enc.cpp
index 912fdb8..36129cc 100644
--- a/src/nvtt/tests/bc1enc.cpp
+++ b/src/nvtt/tests/bc1enc.cpp
@@ -1,6 +1,5 @@
 
 #define  _CRT_SECURE_NO_WARNINGS
-#include <assert.h>
 #include <stdlib.h>
 
 //#define STBI_ASSERT(x)
@@ -13,12 +12,13 @@
 #define RGBCX_IMPLEMENTATION
 #include "../extern/rg/rgbcx.h"
 
+#define ICBC_IMPLEMENTATION
+#include "nvtt/icbc.h"
+
 #include "../extern/libsquish-1.15/squish.h"
 
 #include "../extern/CMP_Core/source/CMP_Core.h"
 
-#include "nvtt/CompressorDXT1.h"
-
 #include "nvmath/Vector.h"
 #include "nvmath/Color.h"
 
@@ -37,73 +37,23 @@ typedef unsigned int u32;
 #define TEST_RGBCX 1
 
 #define TEST_NVTT_FAST 1
+#define TEST_NVTT_TEST 1
 #define TEST_NVTT 1
-#define TEST_NVTT_HQ 1
-
-#define TEST_SQUISH 1
-#define TEST_SQUISH_HQ 1
-
-#define TEST_AMD_CMP 1
-
-
-
-static float mse_to_psnr(float mse) {
-    float rms = sqrtf(mse);
-    float psnr = rms ? (float)clamp(log10(255.0 / rms) * 20.0, 0.0, 300.0) : 1e+10f;
-    return psnr;
-}
-
-/*
-void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma)
-{
-    //assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
+#define TEST_NVTT_HQ 0
 
-    const uint32_t width = std::min(a.get_width(), b.get_width());
-    const uint32_t height = std::min(a.get_height(), b.get_height());
+#define TEST_SQUISH 0
+#define TEST_SQUISH_HQ 0
 
-    double hist[256];
-    memset(hist, 0, sizeof(hist));
+#define TEST_AMD_CMP 0
 
-    for (uint32_t y = 0; y < height; y++)
-    {
-        for (uint32_t x = 0; x < width; x++)
-        {
-            const color_rgba &ca = a(x, y), &cb = b(x, y);
 
-            for (uint32_t c = 0; c < 3; c++)
-                hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++;
-        }
-    }
 
-    m_max = 0;
-    double sum = 0.0f, sum2 = 0.0f;
-    for (uint32_t i = 0; i < 256; i++)
-    {
-        if (hist[i])
-        {
-            m_max = std::max<float>(m_max, (float)i);
-            double v = i * hist[i];
-            sum += v;
-            sum2 += i * v;
-        }
-    }
-
-    double total_values = (double)width * (double)height;
-    if (avg_comp_error)
-        total_values *= (double)clamp<uint32_t>(total_chans, 1, 4);
-
-    m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0);
-    m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0 * 255.0);
-    m_rms = (float)sqrt(m_mean_squared);
-    m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0, 0.0f, 300.0f) : 1e+10f;
-}
-*/
 
 // Returns mse.
-float evaluate_dxt1_mse(uint8 * rgba, uint8 * block, int block_count, int decoder = 0) {
+float evaluate_dxt1_mse(uint8 * rgba, uint8 * block, int block_count, icbc::Decoder decoder = icbc::Decoder_D3D10) {
     double total = 0.0f;
     for (int b = 0; b < block_count; b++) {
-        total += nv::evaluate_dxt1_error(rgba, (BlockDXT1 *)block, decoder);
+        total += icbc::evaluate_dxt1_error(rgba, block, decoder);
         rgba += 4 * 4 * 4;
         block += 8;
     }
@@ -250,7 +200,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "stb";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -274,7 +224,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "stb-hq";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -300,7 +250,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "rgbcx";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -310,22 +260,22 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
 
     if (TEST_NVTT_FAST) {
         memset(block_data, 0, block_count * 8);
-        Vector3 color_weights(1);
+        float color_weights[3] = { 1, 1, 1 };
 
         timer.start();
         for (int i = 0; i < repeat_count; i++) {
             for (int b = 0; b < block_count; b++) {
-                Vector4 input_colors[16];
+                float input_colors[16*4];
                 float input_weights[16];
                 for (int j = 0; j < 16; j++) {
-                    input_colors[j].x = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
-                    input_colors[j].y = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
-                    input_colors[j].z = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
-                    input_colors[j].w = 255.0f;
+                    input_colors[4*j+0] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
+                    input_colors[4*j+1] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
+                    input_colors[4*j+2] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
+                    input_colors[4*j+3] = 1.0f;
                     input_weights[j] = 1.0f;
                 }
 
-                compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)(block_data + b * 8));
+                icbc::compress_dxt1_fast(input_colors, input_weights, color_weights, (block_data + b * 8));
             }
         }
         timer.stop();
@@ -335,7 +285,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "nvtt-fast";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -343,24 +293,59 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         }
     }
 
+    if (TEST_NVTT_TEST) {
+        memset(block_data, 0, block_count * 8);
+        float color_weights[3] = { 1, 1, 1 };
+
+        timer.start();
+        for (int i = 0; i < repeat_count; i++) {
+            for (int b = 0; b < block_count; b++) {
+                float input_colors[16 * 4];
+                float input_weights[16];
+                for (int j = 0; j < 16; j++) {
+                    input_colors[4 * j + 0] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
+                    input_colors[4 * j + 1] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
+                    input_colors[4 * j + 2] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
+                    input_colors[4 * j + 3] = 1.0f;
+                    input_weights[j] = 1.0f;
+                }
+
+                icbc::compress_dxt1_test(input_colors, input_weights, color_weights, (block_data + b * 8));
+            }
+        }
+        timer.stop();
+
+        float mse = evaluate_dxt1_mse(rgba_block_data, block_data, block_count);
+
+        if (stats) {
+            stats->compressorName = "nvtt-test";
+            stats->mseArray[index] = mse;
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
+            stats++;
+        }
+        else {
+            output_dxt_dds(bw, bh, block_data, "nvtt_test.dds");
+        }
+    }
+
     if (TEST_NVTT) {
         memset(block_data, 0, block_count * 8);
-        Vector3 color_weights(1);
+        float color_weights[3] = { 1, 1, 1 };
 
         timer.start();
         for (int i = 0; i < repeat_count; i++) {
             for (int b = 0; b < block_count; b++) {
-                Vector4 input_colors[16];
+                float input_colors[16*4];
                 float input_weights[16];
                 for (int j = 0; j < 16; j++) {
-                    input_colors[j].x = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
-                    input_colors[j].y = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
-                    input_colors[j].z = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
-                    input_colors[j].w = 1.0f;
+                    input_colors[4*j+0] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
+                    input_colors[4*j+1] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
+                    input_colors[4*j+2] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
+                    input_colors[4*j+3] = 1.0f;
                     input_weights[j] = 1.0f;
                 }
 
-                compress_dxt1(input_colors, input_weights, color_weights, false, false, (BlockDXT1*)(block_data + b * 8));
+                icbc::compress_dxt1(input_colors, input_weights, color_weights, false, false, (block_data + b * 8));
             }
         }
         timer.stop();
@@ -370,7 +355,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "nvtt";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -380,22 +365,22 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
 
     if (TEST_NVTT_HQ) {
         memset(block_data, 0, block_count * 8);
-        Vector3 color_weights(1);
+        float color_weights[3] = { 1, 1, 1 };
 
         timer.start();
         for (int i = 0; i < repeat_count; i++) {
             for (int b = 0; b < block_count; b++) {
-                Vector4 input_colors[16];
+                float input_colors[16 * 4];
                 float input_weights[16];
                 for (int j = 0; j < 16; j++) {
-                    input_colors[j].x = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
-                    input_colors[j].y = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
-                    input_colors[j].z = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
-                    input_colors[j].w = 1.0f;
+                    input_colors[4 * j + 0] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 0] / 255.0f;
+                    input_colors[4 * j + 1] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 1] / 255.0f;
+                    input_colors[4 * j + 2] = rgba_block_data[b * 4 * 4 * 4 + j * 4 + 2] / 255.0f;
+                    input_colors[4 * j + 3] = 1.0f;
                     input_weights[j] = 1.0f;
                 }
 
-                compress_dxt1(input_colors, input_weights, color_weights, true, true, (BlockDXT1*)(block_data + b * 8));
+                icbc::compress_dxt1(input_colors, input_weights, color_weights, true, true, (block_data + b * 8));
             }
         }
         timer.stop();
@@ -405,7 +390,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "nvtt-hq";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -429,7 +414,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "squish";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -453,7 +438,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "squish-hq";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -477,7 +462,7 @@ bool test_bc1(const char * inputFileName, int index, Stats * stats) {
         if (stats) {
             stats->compressorName = "cmp";
             stats->mseArray[index] = mse;
-            stats->timeArray[index] = timer.elapsed();
+            stats->timeArray[index] = timer.elapsed() / repeat_count;
             stats++;
         }
         else {
@@ -545,51 +530,51 @@ bool analyze_bc1(const char * inputFileName) {
     int this_should_never_happen = 0;
     int this_should_never_happen_either = 0;
         
-    Vector3 color_weights(1);
+    float color_weights[3] = { 1, 1, 1 };
 
     for (int b = 0; b < block_count; b++) {
 
         uint8 * rgba_block = rgba_block_data + b * 4 * 4 * 4;
         uint8 * dxt_block = block_data + b * 8;
 
-        Vector4 input_colors[16];
+        float input_colors[16*4];
         float input_weights[16];
         for (int j = 0; j < 16; j++) {
-            input_colors[j].x = rgba_block[j * 4 + 0] / 255.0f;
-            input_colors[j].y = rgba_block[j * 4 + 1] / 255.0f;
-            input_colors[j].z = rgba_block[j * 4 + 2] / 255.0f;
-            input_colors[j].w = 255.0f;
+            input_colors[4*j+0] = rgba_block[j * 4 + 0] / 255.0f;
+            input_colors[4*j+1] = rgba_block[j * 4 + 1] / 255.0f;
+            input_colors[4*j+2] = rgba_block[j * 4 + 2] / 255.0f;
+            input_colors[4*j+3] = 1.0f;    // alpha is 1.0f here too, matching the input setup in test_bc1
             input_weights[j] = 1.0f;
         }
 
         // Compare all the different modes on the same block:
 
         stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_NORMAL);
-        float mse_stb = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        float mse_stb = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
         stb_compress_dxt_block(dxt_block, rgba_block, 0, STB_DXT_HIGHQUAL);
-        float mse_stb_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        float mse_stb_hq = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
-        compress_dxt1_fast(input_colors, input_weights, color_weights, (BlockDXT1*)dxt_block);
-        float mse_nvtt_fast = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        icbc::compress_dxt1_fast(input_colors, input_weights, color_weights, dxt_block);
+        float mse_nvtt_fast = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
-        compress_dxt1_fast2(rgba_block, (BlockDXT1*)dxt_block);
-        float mse_nvtt_fast2 = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        icbc::compress_dxt1_fast(rgba_block, dxt_block);
+        float mse_nvtt_fast2 = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
-        compress_dxt1_fast_geld(rgba_block, (BlockDXT1*)dxt_block);
-        float mse_nvtt_geld = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        icbc::compress_dxt1_test(input_colors, input_weights, color_weights, dxt_block);
+        float mse_nvtt_test = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
-        compress_dxt1(input_colors, input_weights, color_weights, true, false, (BlockDXT1*)dxt_block);
-        float mse_nvtt = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        icbc::compress_dxt1(input_colors, input_weights, color_weights, true, false, dxt_block);
+        float mse_nvtt = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
-        compress_dxt1(input_colors, input_weights, color_weights, true, true, (BlockDXT1*)dxt_block);
-        float mse_nvtt_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        icbc::compress_dxt1(input_colors, input_weights, color_weights, true, true, dxt_block);
+        float mse_nvtt_hq = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
         squish::Compress(rgba_block, dxt_block, squish::kDxt1);
-        float mse_squish = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        float mse_squish = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
         squish::Compress(rgba_block, dxt_block, squish::kDxt1 | squish::kColourIterativeClusterFit);
-        float mse_squish_hq = nv::evaluate_dxt1_error(rgba_block, (BlockDXT1 *)dxt_block);
+        float mse_squish_hq = icbc::evaluate_dxt1_error(rgba_block, dxt_block);
 
         if (mse_stb < mse_nvtt_fast) {
             stb_better_than_nvtt_fast++;
@@ -603,6 +588,9 @@ bool analyze_bc1(const char * inputFileName) {
         if (mse_nvtt_hq < mse_nvtt) {
             nvtt_hq_wins++;
         }
+        if (mse_nvtt < mse_nvtt_test) {
+            int k = 1;
+        }
         if (mse_squish < mse_nvtt_hq) {
             squish_better_than_nvtt_hq++;
         }
@@ -619,6 +607,12 @@ bool analyze_bc1(const char * inputFileName) {
 
 
 
+static float mse_to_psnr(float mse) {
+    float rms = sqrtf(mse);
+    float psnr = rms ? (float)clamp(log10(255.0 / rms) * 20.0, 0.0, 300.0) : 1e+10f;
+    return psnr;
+}
+
 
 const char * image_set[] = {
     "testsuite/kodak/kodim01.png",
@@ -696,25 +690,26 @@ const char * roblox_set[] = {
 };
 
 
-
-
 int main(int argc, char *argv[])
 {
-    const char * inputFileName = "testsuite/artificial.png";
-    //const char * inputFileName = "testsuite/kodak/kodim14.png";
+    //const char * inputFileName = "testsuite/artificial.png";
+    const char * inputFileName = "testsuite/kodak/kodim14.png";
     //const char * inputFileName = "testsuite/kodak/kodim18.png";
     //const char * inputFileName = "testsuite/kodak/kodim15.png";
     //const char * inputFileName = "testsuite/waterloo/frymire.png";
     //const char * inputFileName = "Roblox/leafygrass_top/diffuse.tga";
-    
+
+    icbc::init();
+    rgbcx::encode_bc1_init();
+
     test_bc1(inputFileName, 0, NULL);
     //analyze_bc1(inputFileName);
 
-    const char ** set = roblox_set;
-    int count = sizeof(roblox_set) / sizeof(char*);
+    //const char ** set = roblox_set;
+    //int count = sizeof(roblox_set) / sizeof(char*);
 
-    //const char ** set = image_set;
-    //int count = sizeof(image_set) / sizeof(char*);
+    const char ** set = image_set;
+    int count = sizeof(image_set) / sizeof(char*);
 
     const int MAX_COMPRESSOR_COUNT = 16;
     Stats stats[MAX_COMPRESSOR_COUNT];