diff --git a/src/nvtt/BlockCompressor.cpp b/src/nvtt/BlockCompressor.cpp
index 1cd98f4..adf97fe 100644
--- a/src/nvtt/BlockCompressor.cpp
+++ b/src/nvtt/BlockCompressor.cpp
@@ -214,7 +214,7 @@ void FastCompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], co
 }
 void CompressorDXT1::compressBlock(Vector4 colors[16], float weights[16], const CompressionOptions::Private & compressionOptions, void * output)
 {
-    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, (BlockDXT1 *)output);
+    compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, false, (BlockDXT1 *)output);
 }
 
 
diff --git a/src/nvtt/BlockCompressor.h b/src/nvtt/BlockCompressor.h
index 63a9b7c..f23fb99 100644
--- a/src/nvtt/BlockCompressor.h
+++ b/src/nvtt/BlockCompressor.h
@@ -1,29 +1,5 @@
-// Copyright (c) 2009-2011 Ignacio Castano <castano@gmail.com>
-// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#ifndef NVTT_BLOCKCOMPRESSOR_H
-#define NVTT_BLOCKCOMPRESSOR_H
+
+#pragma once
 
 #include "Compressor.h"
 
@@ -189,6 +165,3 @@ namespace nv
 #endif
 
 } // nv namespace
-
-
-#endif // NVTT_BLOCKCOMPRESSOR_H
diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
index 8a09669..128bc4a 100644
--- a/src/nvtt/CompressorDXT1.cpp
+++ b/src/nvtt/CompressorDXT1.cpp
@@ -52,6 +52,7 @@ static const float midpoints6[64] = {
 }*/
 
 static Color16 vector3_to_color16(const Vector3 & v) {
+
     // Truncate.
     uint r = ftoi_trunc(clamp(v.x * 31.0f, 0.0f, 31.0f));
 	uint g = ftoi_trunc(clamp(v.y * 63.0f, 0.0f, 63.0f));
@@ -66,6 +67,7 @@ static Color16 vector3_to_color16(const Vector3 & v) {
 }
 
 
+
 static Color32 bitexpand_color16_to_color32(Color16 c16) {
     Color32 c32;
     //c32.b = (c16.b << 3) | (c16.b >> 2);
@@ -80,36 +82,9 @@ static Color32 bitexpand_color16_to_color32(Color16 c16) {
     return c32;
 }
 
-/*static Color32 bitexpand_color16_to_color32(int r, int g, int b) {
-    Color32 c32;
-    c32.b = (b << 3) | (b >> 2);
-    c32.g = (g << 2) | (g >> 4);
-    c32.r = (r << 3) | (r >> 2);
-    c32.a = 0xFF;
-    return c32;
-}*/
-
-static Color16 truncate_color32_to_color16(Color32 c32) {
-    Color16 c16;
-    c16.b = (c32.b >> 3);
-    c16.g = (c32.g >> 2);
-    c16.r = (c32.r >> 3);
-    return c16;
-}
-
-/*inline Vector3 r5g6b5_to_vector3(int r, int g, int b)
-{
-    Vector3 c;
-    c.x = float((r << 3) | (r >> 2));
-    c.y = float((g << 2) | (g >> 4));
-    c.z = float((b << 3) | (b >> 2));
-    return c;
-}*/
-
 inline Vector3 color_to_vector3(Color32 c)
 {
-    const float scale = 1.0f / 255.0f;
-    return Vector3(c.r * scale, c.g * scale, c.b * scale);
+    return Vector3(c.r / 255.0f, c.g / 255.0f, c.b / 255.0f);
 }
 
 inline Color32 vector3_to_color(Vector3 v)
@@ -126,17 +101,17 @@ inline Color32 vector3_to_color(Vector3 v)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Input block processing.
 
-inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16])
+/*inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16])
 {
 	for (int i = 0; i < 16; i++)
 	{
 		const Color32 c = rgba.color(i);
 		block[i] = Vector3(c.r, c.g, c.b);
 	}
-}
+}*/
 
 // Find first valid color.
-static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color)
+/*static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color)
 {
     for (int i = 0; i < count; i++) {
         if (weights[i] > 0.0f) {
@@ -147,9 +122,9 @@ static bool find_valid_color_rgb(const Vector3 * colors, const float * weights,
 
     // No valid colors.
     return false;
-}
+}*/
 
-static bool is_single_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 color)
+/*static bool is_single_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 color)
 {
     for (int i = 0; i < count; i++) {
         if (weights[i] > 0.0f) {
@@ -158,7 +133,7 @@ static bool is_single_color_rgb(const Vector3 * colors, const float * weights, i
     }
 
     return true;
-}
+}*/
 
 // Find similar colors and combine them together.
 static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights)
@@ -193,23 +168,56 @@ static int reduce_colors(const Vector4 * input_colors, const float * input_weigh
     return n;
 }
 
+static int reduce_colors(const uint8 * input_colors, Vector3 * colors, float * weights)
+{
+    int n = 0;
+    for (int i = 0; i < 16; i++)
+    {
+        Vector3 ci;
+        ci.x = float(input_colors[4 * i + 0]);
+        ci.y = float(input_colors[4 * i + 1]);
+        ci.z = float(input_colors[4 * i + 2]);
+
+        // Find matching color.
+        int j;
+        for (j = 0; j < n; j++) {
+            if (equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z)) {
+                weights[j] += 1.0f;
+                break;
+            }
+        }
+
+        // No match found. Add new color.
+        if (j == n) {
+            colors[n] = ci;
+            weights[n] = 1.0f;
+            n++;
+        }
+    }
+
+    nvDebugCheck(n <= 16);
+
+    return n;
+}
+
+
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Error evaluation.
 
 // Different ways of estimating the error.
-/*static float evaluate_mse(const Vector3 & p, const Vector3 & c) {
-    //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z);
-    Vector3 d = (p - c);
-    return dot(d, d);
-}*/
 
 static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
-    //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z);
-    Vector3 d = (p - c) * w;
+    Vector3 d = (p * 255 - c * 255) * w;
     return dot(d, d);
 }
 
+static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) {
+    Vector3 d = (Vector3(p.r, p.g, p.b) - c * 255) * w;
+    return dot(d, d);
+}
+
+
 /*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
     return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z);
 }*/
@@ -237,7 +245,8 @@ static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
 // Returns MSE error in [0-255] range.
 static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
     Color32 palette[4];
-    output->evaluatePalette(palette, /*d3d9=*/false);
+    //output->evaluatePalette(palette, /*d3d9=*/false);
+    output->evaluatePaletteNV5x(palette);
 
     return evaluate_mse(palette[index], color);
 }
@@ -253,6 +262,16 @@ static float evaluate_palette_error(Color32 palette[4], const Color32 * colors,
     return total;
 }
 
+static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) {
+
+    float total = 0.0f;
+    for (int i = 0; i < count; i++) {
+        total += evaluate_mse(palette, colors[i]);
+    }
+
+    return total;
+}
+
 #if 0
 static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
     Color32 palette[4];
@@ -278,18 +297,38 @@ static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
 static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
     Color32 palette[4];
     output->evaluatePalette(palette, /*d3d9=*/false);
+    //output->evaluatePaletteNV5x(palette);
 
     // convert palette to float.
-    Vector3 vector_palette[4];
+    /*Vector3 vector_palette[4];
     for (int i = 0; i < 4; i++) {
         vector_palette[i] = color_to_vector3(palette[i]);
-    }
+    }*/
 
     // evaluate error for each index.
     float error = 0.0f;
     for (int i = 0; i < 16; i++) {
         int index = (output->indices >> (2 * i)) & 3;
-        error += input_weights[i] * evaluate_mse(vector_palette[index], input_colors[i].xyz(), color_weights);
+        error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz(), color_weights);
+    }
+    return error;
+}
+
+float nv::evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, int decoder) {
+    Color32 palette[4];
+    if (decoder == 2) {
+        block->evaluatePaletteNV5x(palette);
+    }
+    else {
+        block->evaluatePalette(palette, /*d3d9=*/decoder);
+    }
+
+    // evaluate error for each index.
+    float error = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        int index = (block->indices >> (2 * i)) & 3;
+        Color32 c(rgba_block[4 * i + 0], rgba_block[4 * i + 1], rgba_block[4 * i + 2]);
+        error += evaluate_mse(palette[index], c);
     }
     return error;
 }
@@ -299,19 +338,45 @@ static float evaluate_mse(const Vector4 input_colors[16], const float input_weig
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Palette evaluation.
 
-static void evaluate_palette4(Color32 palette[4]) {
+#define DECODER 0
+
+inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) {
+#if DECODER == 0
     palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
     palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
     palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
     palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
     palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
     palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
+#elif DECODER == 1
+    palette[2].r = (2 * palette[0].r + palette[1].r + 1) / 3;
+    palette[2].g = (2 * palette[0].g + palette[1].g + 1) / 3;
+    palette[2].b = (2 * palette[0].b + palette[1].b + 1) / 3;
+    palette[3].r = (2 * palette[1].r + palette[0].r + 1) / 3;
+    palette[3].g = (2 * palette[1].g + palette[0].g + 1) / 3;
+    palette[3].b = (2 * palette[1].b + palette[0].b + 1) / 3;
+#else
+    int dg = palette[1].g - palette[0].g;
+    palette[2].r = ((2 * c0.r + c1.r) * 22) / 8;
+    palette[2].g = (256 * palette[0].g + dg * 80 + dg / 4 + 128) / 256;
+    palette[2].b = ((2 * c0.b + c1.b) * 22) / 8;
+    palette[3].r = ((2 * c1.r + c0.r) * 22) / 8;
+    palette[3].g = (256 * palette[1].g - dg * 80 - dg / 4 + 128) / 256;
+    palette[3].b = ((2 * c1.b + c0.b) * 22) / 8;
+#endif
 }
 
-static void evaluate_palette3(Color32 palette[4]) {
+inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) {
+#if DECODER == 0 || DECODER == 1
     palette[2].r = (palette[0].r + palette[1].r) / 2;
     palette[2].g = (palette[0].g + palette[1].g) / 2;
     palette[2].b = (palette[0].b + palette[1].b) / 2;
+#else
+    int dg = palette[1].g - palette[0].g;
+    palette[2].r = ((c0.r + c1.r) * 33) / 8;
+    palette[2].g = (256 * palette[0].g + dg * 128 + dg / 4 + 128) / 256;
+    palette[2].b = ((c0.b + c1.b) * 33) / 8;
+#endif
     palette[3].r = 0;
     palette[3].g = 0;
     palette[3].b = 0;
@@ -321,10 +386,10 @@ static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) {
     palette[0] = bitexpand_color16_to_color32(c0);
     palette[1] = bitexpand_color16_to_color32(c1);
     if (c0.u > c1.u) {
-        evaluate_palette4(palette);
+        evaluate_palette4(c0, c1, palette);
     }
     else {
-        evaluate_palette3(palette);
+        evaluate_palette3(c0, c1, palette);
     }
 }
 
@@ -378,6 +443,32 @@ static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & col
 }
 
 
+static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) {
+
+    uint indices = 0;
+    for (int i = 0; i < 16; i++) {
+        float d0 = evaluate_mse(palette[0], input_colors[i], Vector3(1));
+        float d1 = evaluate_mse(palette[1], input_colors[i], Vector3(1));
+        float d2 = evaluate_mse(palette[2], input_colors[i], Vector3(1));
+        float d3 = evaluate_mse(palette[3], input_colors[i], Vector3(1));
+
+        uint b0 = d0 > d3;
+        uint b1 = d1 > d2;
+        uint b2 = d0 > d2;
+        uint b3 = d1 > d3;
+        uint b4 = d2 > d3;
+
+        uint x0 = b1 & b2;
+        uint x1 = b0 & b3;
+        uint x2 = b0 & b4;
+
+        indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
+    }
+
+    return indices;
+}
+
+
 static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
     
     uint indices = 0;
@@ -435,7 +526,186 @@ static void output_block4(const Vector4 input_colors[16], const Vector3 & color_
 }
 
 
+static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+{
+    Color16 color0 = vector3_to_color16(v0);
+    Color16 color1 = vector3_to_color16(v1);
+
+    if (color0.u < color1.u) {
+        swap(color0, color1);
+    }
+
+    Vector3 palette[4];
+    evaluate_palette(color0, color1, palette);
+
+    block->col0 = color0;
+    block->col1 = color1;
+    block->indices = compute_indices4(input_colors, palette);
+}
+
+// Least squares fitting of color end points for the given indices. @@ Take weights into account.
+static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
+{
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    Vector3 alphax_sum(0.0f);
+    Vector3 betax_sum(0.0f);
 
+    for (int i = 0; i < count; i++)
+    {
+        const uint bits = indices >> (2 * i);
+
+        float beta = float(bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
+
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i].xyz();
+        betax_sum += beta * colors[i].xyz();
+    }
+
+    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
+    if (equal(denom, 0.0f)) return false;
+
+    float factor = 1.0f / denom;
+
+    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
+    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
+
+    return true;
+}
+
+static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b)
+{
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    Vector3 alphax_sum(0.0f);
+    Vector3 betax_sum(0.0f);
+
+    for (int i = 0; i < count; i++)
+    {
+        const uint bits = indices >> (2 * i);
+
+        float beta = float(bits & 1);
+        if (bits & 2) beta = (1 + beta) / 3.0f;
+        float alpha = 1.0f - beta;
+
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
+
+    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
+    if (equal(denom, 0.0f)) return false;
+
+    float factor = 1.0f / denom;
+
+    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
+    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
+
+    return true;
+}
+
+
+// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account.
+static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
+{
+    float alpha2_sum = 0.0f;
+    float beta2_sum = 0.0f;
+    float alphabeta_sum = 0.0f;
+    Vector3 alphax_sum(0.0f);
+    Vector3 betax_sum(0.0f);
+
+    for (int i = 0; i < count; i++)
+    {
+        const uint bits = indices >> (2 * i);
+
+        float beta = float(bits & 1);
+        if (bits & 2) beta = 0.5f;
+        float alpha = 1.0f - beta;
+
+        alpha2_sum += alpha * alpha;
+        beta2_sum += beta * beta;
+        alphabeta_sum += alpha * beta;
+        alphax_sum += alpha * colors[i];
+        betax_sum += beta * colors[i];
+    }
+
+    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
+    if (equal(denom, 0.0f)) return false;
+
+    float factor = 1.0f / denom;
+
+    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
+    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
+
+    return true;
+}
+
+// @@ After optimization we need to round end points. Round in all possible directions, and pick best.
+
+
+
+// find minimum and maximum colors based on bounding box in color space
+inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    *c0 = Vector3(0);
+    *c1 = Vector3(1);
+
+    for (int i = 0; i < count; i++) {
+        *c0 = max(*c0, colors[i]);
+        *c1 = min(*c1, colors[i]);
+    }
+}
+
+inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    Vector3 center = (*c0 + *c1) * 0.5f;
+
+    /*Vector3 center = colors[0];
+    for (int i = 1; i < count; i++) {
+        center = center * float(i-1) / i + colors[i] / i;
+    }*/
+    /*Vector3 center = colors[0];
+    for (int i = 1; i < count; i++) {
+        center += colors[i];
+    }
+    center /= count;*/
+
+    Vector2 covariance = Vector2(0);
+    for (int i = 0; i < count; i++) {
+        Vector3 t = colors[i] - center;
+        covariance += t.xy() * t.z;
+    }
+
+    float x0 = c0->x;
+    float y0 = c0->y;
+    float x1 = c1->x;
+    float y1 = c1->y;
+
+    if (covariance.x < 0) {
+        swap(x0, x1);
+    }
+    if (covariance.y < 0) {
+        swap(y0, y1);
+    }
+
+    c0->set(x0, y0, c0->z);
+    c1->set(x1, y1, c1->z);
+}
+
+inline static void inset_bbox(Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    Vector3 inset = (*c0 - *c1) / 16.0f - (8.0f / 255.0f) / 16.0f;
+    *c0 = saturate(*c0 - inset);
+    *c1 = saturate(*c1 + inset);
+}
 
 
 // Single color compressor, based on:
@@ -505,16 +775,6 @@ float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weigh
 }
 
 
-/* @@ Not implemented yet.
-// Low quality baseline compressor.
-float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output)
-{
-    // @@ Iterative best end point fit.
-
-    return FLT_MAX;
-}*/
-
-
 float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output)
 {
     // Compute bounding box.
@@ -583,12 +843,12 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16],
 
             if (c0.u > c1.u) {
                 // Evaluate error in 4 color mode.
-                evaluate_palette4(palette);
+                evaluate_palette4(c0, c1, palette);
             }
             else {
                 if (three_color_mode) {
                     // Evaluate error in 3 color mode.
-                    evaluate_palette3(palette);
+                    evaluate_palette3(c0, c1, palette);
                 }
                 else {
                     // Skip 3 color mode.
@@ -637,9 +897,65 @@ void nv::compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3
 }
 
 
+/*static unsigned int stb__MatchColorsBlock(uint8 *block, uint8 *color)
+{
+    uint mask = 0;
+    int dir[3];
+    dir[0] = color[0 * 4 + 0] - color[1 * 4 + 0];
+    dir[1] = color[0 * 4 + 1] - color[1 * 4 + 1];
+    dir[2] = color[0 * 4 + 2] - color[1 * 4 + 2];
+    int dots[16];
+    int stops[4];
+    int i;
+
+    for (i = 0;i < 16;i++)
+        dots[i] = block[i * 4 + 0] * dir[0] + block[i * 4 + 1] * dir[1] + block[i * 4 + 2] * dir[2];
+
+    for (i = 0;i < 4;i++)
+        stops[i] = color[i * 4 + 0] * dir[0] + color[i * 4 + 1] * dir[1] + color[i * 4 + 2] * dir[2];
+
+    // think of the colors as arranged on a line; project point onto that line, then choose
+    // next color out of available ones. we compute the crossover points for "best color in top
+    // half"/"best in bottom half" and then the same inside that subinterval.
+    //
+    // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
+    // but it's very close and a lot faster.
+    // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
+
+    int c0Point = (stops[1] + stops[3]);
+    int halfPoint = (stops[3] + stops[2]);
+    int c3Point = (stops[2] + stops[0]);
+
+    for (i = 15;i >= 0;i--) {
+        int dot = 2 * dots[i];
+        mask <<= 2;
+
+        uint sel;
+        if (dot < halfPoint)
+            sel = (dot < c0Point) ? 1 : 3;
+        else
+            sel = (dot < c3Point) ? 2 : 0;
+
+        mask |= sel;
+    }
+
+    return mask;
+}*/
+
+
+inline uint32 mod3(uint32 a) {
+    a = (a >> 16) + (a & 0xFFFF);   /* sum base 2**16 digits    a <= 0x1FFFE */
+    a = (a >> 8) + (a & 0xFF);      /* sum base 2**8 digits     a <= 0x2FD */
+    a = (a >> 4) + (a & 0xF);       /* sum base 2**4 digits     a <= 0x3C; worst case 0x3B */
+    a = (a >> 2) + (a & 0x3);       /* sum base 2**2 digits     a <= 0x1D; worst case 0x1B */
+    a = (a >> 2) + (a & 0x3);       /* sum base 2**2 digits     a <= 0x9; worst case 0x7 */
+    a = (a >> 2) + (a & 0x3);       /* sum base 2**2 digits     a <= 0x4 */
+    if (a > 2) a = a - 3;
+    return a;
+}
 
 
-float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output)
+float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output)
 {
     Vector3 colors[16];
     float weights[16];
@@ -658,7 +974,7 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
 
     // Sometimes the single color compressor produces better results than the exhaustive. This introduces discontinuities between blocks that
     // use different compressors. For this reason, this is not enabled by default.
-    if (1) {
+    if (0) {
         error = compress_dxt1_single_color(colors, weights, count, color_weights, output);
 
         if (error == 0.0f || count == 1) {
@@ -686,16 +1002,48 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
         }
     }
 
-    // @@ TODO.
-    // This is pretty fast and in some cases can produces better quality than cluster fit.
-    //error = compress_dxt1_least_squares_fit(colors, weigths, error, output);
-
     // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already.
     if (error == FLT_MAX && count == 1) {
         error = compress_dxt1_single_color_optimal(colors[0], output);
     }
 
     if (count > 1) {
+        // Fast box fit encoding:
+        {
+            BlockDXT1 box_fit_output;
+
+            Vector3 colors[16];
+            for (int i = 0; i < 16; i++) {
+                colors[i] = input_colors[i].xyz();
+            }
+            int count = 16;
+
+            // Quick end point selection.
+            Vector3 c0, c1;
+            fit_colors_bbox(colors, count, &c0, &c1);
+            inset_bbox(&c0, &c1);
+            select_diagonal(colors, count, &c0, &c1);
+            output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
+
+            float box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
+            if (box_fit_error < error) {
+                error = box_fit_error;
+                *output = box_fit_output;
+
+                // Refine color for the selected indices.
+                if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
+                    output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
+
+                    box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
+                    if (box_fit_error < error) {
+                        error = box_fit_error;
+                        *output = box_fit_output;
+                    }
+                }
+            }
+        }
+
+        // Try cluster fit.
         BlockDXT1 cluster_fit_output;
         compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output);
 
@@ -704,6 +1052,74 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
         if (cluster_fit_error < error) {
             *output = cluster_fit_output;
             error = cluster_fit_error;
+
+            /*if (hq && cluster_fit_output.isFourColorMode()) {
+                
+                // Refine color for the selected indices.
+                Vector3 c0, c1;
+                if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
+                    BlockDXT1 box_fit_output;
+                    output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
+
+                    float box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
+                    if (box_fit_error < error) {
+                        error = box_fit_error;
+                        *output = box_fit_output;
+                    }
+                }
+            }*/
+
+            if (hq) {
+                int8 deltas[16][3] = {
+                    {1,0,0},
+                    {0,1,0},
+                    {0,0,1},
+
+                    {-1,0,0},
+                    {0,-1,0},
+                    {0,0,-1},
+
+                    {1,1,0},
+                    {1,0,1},
+                    {0,1,1},
+
+                    {-1,-1,0},
+                    {-1,0,-1},
+                    {0,-1,-1},
+
+                    {-1,1,0},
+                    //{-1,0,1},
+
+                    {1,-1,0},
+                    {0,-1,1},
+
+                    //{1,0,-1},
+                    {0,1,-1},
+                };
+
+                BlockDXT1 refined = *output;
+                for (int i = 0; i < 10000; i++) {
+                    int rnd = i * 2654435761;
+                    int8 delta[3] = { deltas[rnd % 16][0], deltas[rnd % 16][1], deltas[rnd % 16][2] };
+
+                    if ((rnd / 16) & 1) {
+                        refined.col0.r += delta[0];
+                        refined.col0.g += delta[1];
+                        refined.col0.b += delta[2];
+                    }
+                    else {
+                        refined.col1.r += delta[0];
+                        refined.col1.g += delta[1];
+                        refined.col1.b += delta[2];
+                    }
+
+                    float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
+                    if (refined_error < error) {
+                        *output = refined;
+                        error = refined_error;
+                    }
+                }
+            }
         }
     }
 
@@ -720,165 +1136,389 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
 // @@ How do we do the initial index/cluster assignment? Use standard cluster fit.
 
 
-// Least squares fitting of color end points for the given indices. @@ Take weights into account.
-static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
+
+float nv::compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
 {
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    Vector3 alphax_sum(0.0f);
-    Vector3 betax_sum(0.0f);
+    Vector3 colors[16];
+    for (int i = 0; i < 16; i++) {
+        colors[i] = input_colors[i].xyz();
+    }
+    int count = 16;
 
-    for (int i = 0; i < count; i++)
-    {
-        const uint bits = indices >> (2 * i);
+    /*float error = FLT_MAX;
+    error = compress_dxt1_single_color(colors, input_weights, count, color_weights, output);
 
-        float beta = float(bits & 1);
-        if (bits & 2) beta = (1 + beta) / 3.0f;
-        float alpha = 1.0f - beta;
+    if (error == 0.0f || count == 1) {
+        // Early out.
+        return error;
+    }*/
 
-        alpha2_sum += alpha * alpha;
-        beta2_sum += beta * beta;
-        alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i].xyz();
-        betax_sum += beta * colors[i].xyz();
+    // Quick end point selection.
+    Vector3 c0, c1;
+    fit_colors_bbox(colors, count, &c0, &c1);
+    if (c0 == c1) {
+        ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output);
+        return evaluate_mse(input_colors, input_weights, color_weights, output);
     }
+    inset_bbox(&c0, &c1);
+    select_diagonal(colors, count, &c0, &c1);
+    output_block4(input_colors, color_weights, c0, c1, output);
 
-    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
-    if (equal(denom, 0.0f)) return false;
+    // Refine color for the selected indices.
+    if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
+        output_block4(input_colors, color_weights, c0, c1, output);
+    }
 
-    float factor = 1.0f / denom;
+    return evaluate_mse(input_colors, input_weights, color_weights, output);
+}
 
-    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
-    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
 
-    return true;
-}
+void nv::compress_dxt1_fast2(const uint8 input_colors[16*4], BlockDXT1 * output) {
+    /*Vector3 colors[16];
+    float weights[16];
+    int count = reduce_colors(input_colors, colors, weights);
 
+    if (count == 0) {
+        // Output trivial block.
+        output->col0.u = 0;
+        output->col1.u = 0;
+        output->indices = 0;
+        return;
+    }
 
-// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account.
-static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
-{
-    float alpha2_sum = 0.0f;
-    float beta2_sum = 0.0f;
-    float alphabeta_sum = 0.0f;
-    Vector3 alphax_sum(0.0f);
-    Vector3 betax_sum(0.0f);
 
-    for (int i = 0; i < count; i++)
-    {
-        const uint bits = indices >> (2 * i);
+    float error = FLT_MAX;
+    error = compress_dxt1_single_color(colors, weights, count, Vector3(1.0f), output);
 
-        float beta = float(bits & 1);
-        if (bits & 2) beta = 0.5f;
-        float alpha = 1.0f - beta;
+    if (error == 0.0f || count == 1) {
+        // Early out.
+        return;
+    }*/
 
-        alpha2_sum += alpha * alpha;
-        beta2_sum += beta * beta;
-        alphabeta_sum += alpha * beta;
-        alphax_sum += alpha * colors[i];
-        betax_sum += beta * colors[i];
+    Vector3 vec_colors[16];
+    for (int i = 0; i < 16; i++) {
+        vec_colors[i] = Vector3(input_colors[4 * i + 0] / 255.0f, input_colors[4 * i + 1] / 255.0f, input_colors[4 * i + 2] / 255.0f);
     }
 
-    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
-    if (equal(denom, 0.0f)) return false;
+    // Quick end point selection.
+    Vector3 c0, c1;
+    //fit_colors_bbox(colors, count, &c0, &c1);
+    //select_diagonal(colors, count, &c0, &c1);
+    fit_colors_bbox(vec_colors, 16, &c0, &c1);
+    if (c0 == c1) {
+        ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output);
+        return;
+    }
+    inset_bbox(&c0, &c1);
+    select_diagonal(vec_colors, 16, &c0, &c1);
+    output_block4(vec_colors, c0, c1, output);
 
-    float factor = 1.0f / denom;
+    // Refine color for the selected indices.
+    if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) {
+        output_block4(vec_colors, c0, c1, output);
+    }
+}
 
-    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
-    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
 
-    return true;
+static int Mul8Bit(int a, int b)
+{
+    int t = a * b + 128;
+    return (t + (t >> 8)) >> 8;
 }
 
-// @@ After optimization we need to round end points. Round in all possible directions, and pick best.
+static bool compute_least_squares_endpoints(const uint8 *block, uint32 mask, Vector3 *pmax, Vector3 *pmin)
+{
+    static const int w1Tab[4] = { 3,0,2,1 };
+    static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 };
+    // ^some magic to save a lot of multiplies in the accumulating loop...
+    // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
 
+    int akku = 0;
+    int At1_r, At1_g, At1_b;
+    int At2_r, At2_g, At2_b;
+    unsigned int cm = mask;
 
+    if ((mask ^ (mask << 2)) < 4) // all pixels have the same index?
+    {
+        return false;
+    }
+    else {
+        At1_r = At1_g = At1_b = 0;
+        At2_r = At2_g = At2_b = 0;
+        for (int i = 0;i < 16;++i, cm >>= 2) {
+            int step = cm & 3;
+            int w1 = w1Tab[step];
+            int r = block[i * 4 + 0];
+            int g = block[i * 4 + 1];
+            int b = block[i * 4 + 2];
+
+            akku += prods[step];
+            At1_r += w1 * r;
+            At1_g += w1 * g;
+            At1_b += w1 * b;
+            At2_r += r;
+            At2_g += g;
+            At2_b += b;
+        }
 
-// find minimum and maximum colors based on bounding box in color space
-inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
-{
-    *c0 = Vector3(0);
-    *c1 = Vector3(255);
+        At2_r = 3 * At2_r - At1_r;
+        At2_g = 3 * At2_g - At1_g;
+        At2_b = 3 * At2_b - At1_b;
 
-    for (int i = 0; i < count; i++) {
-        *c0 = max(*c0, colors[i]);
-        *c1 = min(*c1, colors[i]);
+        // extract solutions and decide solvability
+        int xx = akku >> 16;
+        int yy = (akku >> 8) & 0xff;
+        int xy = (akku >> 0) & 0xff;
+
+        float f = 3.0f / 255.0f / (xx*yy - xy * xy);
+
+        // solve.
+        pmax->x = (At1_r*yy - At2_r * xy) * f;
+        pmax->y = (At1_r*yy - At2_r * xy) * f;
+        pmax->z = (At1_r*yy - At2_r * xy) * f;
+
+        pmin->x = (At2_r*xx - At1_r * xy) * f;
+        pmin->y = (At2_r*xx - At1_r * xy) * f;
+        pmin->z = (At2_r*xx - At1_r * xy) * f;
+
+        return true;
     }
 }
 
-inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+
+static uint32 bc1_find_sels(const uint8 *input_colors, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb)
 {
-    Vector3 center = (*c0 + *c1) * 0.5f;
+    uint32_t block_r[4], block_g[4], block_b[4];
 
-    Vector2 covariance = Vector2(0);
-    for (int i = 0; i < count; i++) {
-        Vector3 t = colors[i] - center;
-        covariance += t.xy() * t.z;
+    block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2);
+    block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2);
+    block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3;
+    block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3;
+
+    int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0];
+
+    int dots[4];
+    for (uint32_t i = 0; i < 4; i++)
+        dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab;
+
+    int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3];
+
+    ar *= 2; ag *= 2; ab *= 2;
+
+    uint sels = 0;
+    for (uint32_t i = 0; i < 16; i++)
+    {
+        const int d = input_colors[4*i+0] * ar + input_colors[4*i+1] * ag + input_colors[4*i+2] * ab;
+        static const uint8_t s_sels[4] = { 3, 2, 1, 0 };
+
+        // Rounding matters here!
+        // d <= t0: <=, not <, to the later LS step "sees" a wider range of selectors. It matters for quality.
+        sels |= s_sels[(d <= t0) + (d < t1) + (d < t2)] << (2 * i);
     }
+    return sels;
+}
 
-    float x0 = c0->x;
-    float y0 = c0->y;
-    float x1 = c1->x;
-    float y1 = c1->y;
 
-    if (covariance.x < 0) {
-        swap(x0, x1);
+void nv::compress_dxt1_fast_geld(const uint8 input_colors[16 * 4], BlockDXT1 * block) {
+
+    int fr = input_colors[0];
+    int fg = input_colors[1];
+    int fb = input_colors[2];
+
+    int total_r = fr, total_g = fg, total_b = fb;
+    int max_r = fr, max_g = fg, max_b = fb;
+    int min_r = fr, min_g = fg, min_b = fb;
+    uint32 grayscale_flag = (fr == fg) && (fr == fb);
+    for (uint32 i = 1; i < 16; i++)
+    {
+        const int r = input_colors[4*i+0], g = input_colors[4 * i + 1], b = input_colors[4 * i + 2];
+        grayscale_flag &= ((r == g) && (r == b));
+        max_r = max(max_r, r); max_g = max(max_g, g); max_b = max(max_b, b);
+        min_r = min(min_r, r); min_g = min(min_g, g); min_b = min(min_b, b);
+        total_r += r; total_g += g; total_b += b;
     }
-    if (covariance.y < 0) {
-        swap(y0, y1);
+
+    int lr, lg, lb;
+    int hr, hg, hb;
+
+    if (grayscale_flag) {
+        // Grayscale blocks are a common enough case to specialize.
+        lr = lb = Mul8Bit(min_r, 31);
+        lg = Mul8Bit(min_r, 63);
+
+        hr = hb = Mul8Bit(max_r, 31);
+        hg = Mul8Bit(max_r, 63);
     }
+    else {
+        int avg_r = (total_r + 8) >> 4, avg_g = (total_g + 8) >> 4, avg_b = (total_b + 8) >> 4;
 
-    c0->set(x0, y0, c0->z);
-    c1->set(x1, y1, c1->z);
-}
+        // Find the shortest vector from a AABB corner to the block's average color.
+        // This is to help avoid outliers.
 
-inline static void inset_bbox(Vector3 * restrict c0, Vector3 * restrict c1)
-{
-    Vector3 inset = (*c0 - *c1) / 16.0f - (8.0f / 255.0f) / 16.0f;
-    *c0 = clamp(*c0 - inset, 0.0f, 255.0f);
-    *c1 = clamp(*c1 + inset, 0.0f, 255.0f);
-}
+        uint32_t dist[3][2];
+        dist[0][0] = square(min_r - avg_r) << 3; dist[0][1] = square(max_r - avg_r) << 3;
+        dist[1][0] = square(min_g - avg_g) << 3; dist[1][1] = square(max_g - avg_g) << 3;
+        dist[2][0] = square(min_b - avg_b) << 3; dist[2][1] = square(max_b - avg_b) << 3;
 
+        uint32_t min_d0 = (dist[0][0] + dist[1][0] + dist[2][0]);
+        uint32_t d4 = (dist[0][0] + dist[1][0] + dist[2][1]) | 4;
+        min_d0 = min(min_d0, d4);
 
+        uint32_t min_d1 = (dist[0][1] + dist[1][0] + dist[2][0]) | 1;
+        uint32_t d5 = (dist[0][1] + dist[1][0] + dist[2][1]) | 5;
+        min_d1 = min(min_d1, d5);
 
-float nv::compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
-{
-    Vector3 colors[16];
-    float weights[16];
-    int count = reduce_colors(input_colors, input_weights, colors, weights);
+        uint32_t d2 = (dist[0][0] + dist[1][1] + dist[2][0]) | 2;
+        min_d0 = min(min_d0, d2);
 
-    if (count == 0) {
-        // Output trivial block.
-        output->col0.u = 0;
-        output->col1.u = 0;
-        output->indices = 0;
-        return 0;
-    }
+        uint32_t d3 = (dist[0][1] + dist[1][1] + dist[2][0]) | 3;
+        min_d1 = min(min_d1, d3);
 
+        uint32_t d6 = (dist[0][0] + dist[1][1] + dist[2][1]) | 6;
+        min_d0 = min(min_d0, d6);
 
-    float error = FLT_MAX;
-    error = compress_dxt1_single_color(colors, weights, count, color_weights, output);
+        uint32_t d7 = (dist[0][1] + dist[1][1] + dist[2][1]) | 7;
+        min_d1 = min(min_d1, d7);
 
-    if (error == 0.0f || count == 1) {
-        // Early out.
-        return error;
+        uint32_t min_d = min(min_d0, min_d1);
+        uint32_t best_i = min_d & 7;
+
+        const int delta_r = (best_i & 1) ? (max_r - avg_r) : (avg_r - min_r);
+        const int delta_g = (best_i & 2) ? (max_g - avg_g) : (avg_g - min_g);
+        const int delta_b = (best_i & 4) ? (max_b - avg_b) : (avg_b - min_b);
+
+        // Now we have a smaller AABB going from the block's average color to a cornerpoint of the larger AABB.
+        // Project all pixels colors along the 4 vectors going from a smaller AABB cornerpoint to the opposite cornerpoint, find largest projection.
+        // One of these vectors will be a decent approximation of the block's PCA.
+        const int saxis0_r = delta_r, saxis0_g = delta_g, saxis0_b = delta_b;
+
+        int low_dot0 = INT_MAX, high_dot0 = INT_MIN;
+        int low_dot1 = INT_MAX, high_dot1 = INT_MIN;
+        int low_dot2 = INT_MAX, high_dot2 = INT_MIN;
+        int low_dot3 = INT_MAX, high_dot3 = INT_MIN;
+
+        int low_c0, low_c1, low_c2, low_c3;
+        int high_c0, high_c1, high_c2, high_c3;
+
+        for (uint32_t i = 0; i < 16; i++)
+        {
+            const int dotx = input_colors[4*i+0] * saxis0_r;
+            const int doty = input_colors[4*i+1] * saxis0_g;
+            const int dotz = input_colors[4*i+2] * saxis0_b;
+
+            const int dot0 = ((dotz + dotx + doty) << 4) + i;
+            const int dot1 = ((dotz - dotx - doty) << 4) + i;
+            const int dot2 = ((dotz - dotx + doty) << 4) + i;
+            const int dot3 = ((dotz + dotx - doty) << 4) + i;
+
+            if (dot0 < low_dot0)
+            {
+                low_dot0 = dot0;
+                low_c0 = i;
+            }
+            if ((dot0 ^ 15) > high_dot0)
+            {
+                high_dot0 = dot0 ^ 15;
+                high_c0 = i;
+            }
+
+            if (dot1 < low_dot1)
+            {
+                low_dot1 = dot1;
+                low_c1 = i;
+            }
+            if ((dot1 ^ 15) > high_dot1)
+            {
+                high_dot1 = dot1 ^ 15;
+                high_c1 = i;
+            }
+
+            if (dot2 < low_dot2)
+            {
+                low_dot2 = dot2;
+                low_c2 = i;
+            }
+            if ((dot2 ^ 15) > high_dot2)
+            {
+                high_dot2 = dot2 ^ 15;
+                high_c2 = i;
+            }
+
+            if (dot3 < low_dot3)
+            {
+                low_dot3 = dot3;
+                low_c3 = i;
+            }
+            if ((dot3 ^ 15) > high_dot3)
+            {
+                high_dot3 = dot3 ^ 15;
+                high_c3 = i;
+            }
+        }
+
+
+        uint32_t low_c = low_dot0 & 15, high_c = ~high_dot0 & 15, r = (high_dot0 & ~15) - (low_dot0 & ~15);
+
+        uint32_t tr = (high_dot1 & ~15) - (low_dot1 & ~15);
+        if (tr > r)
+            low_c = low_dot1 & 15, high_c = ~high_dot1 & 15, r = tr;
+
+        tr = (high_dot2 & ~15) - (low_dot2 & ~15);
+        if (tr > r)
+            low_c = low_dot2 & 15, high_c = ~high_dot2 & 15, r = tr;
+
+        tr = (high_dot3 & ~15) - (low_dot3 & ~15);
+        if (tr > r)
+            low_c = low_dot3 & 15, high_c = ~high_dot3 & 15;
+
+        lr = Mul8Bit(input_colors[low_c*4+0], 31);
+        lg = Mul8Bit(input_colors[low_c*4+1], 63);
+        lb = Mul8Bit(input_colors[low_c*4+2], 31);
+
+        hr = Mul8Bit(input_colors[high_c*4+0], 31);
+        hg = Mul8Bit(input_colors[high_c*4+1], 63);
+        hb = Mul8Bit(input_colors[high_c*4+2], 31);
     }
 
-    // Quick end point selection.
-    Vector3 c0, c1;
-    fit_colors_bbox(colors, count, &c0, &c1);
-    select_diagonal(colors, count, &c0, &c1);
-    inset_bbox(&c0, &c1);
-    output_block4(input_colors, color_weights, c0, c1, output);
+    uint32 selectors = bc1_find_sels(input_colors, lr, lg, lb, hr, hg, hb);
 
-    // Refine color for the selected indices.
-    if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
-        output_block4(input_colors, color_weights, c0, c1, output);
+    Vector3 c0, c1;
+    if (!compute_least_squares_endpoints(input_colors, selectors, &c0, &c1)) {
+        // @@ Single color compressor.
+        Color32 c(lr, lg, lb);
+        ::compress_dxt1_single_color_optimal(c, block);
     }
+    else {
+        Color16 color0 = vector3_to_color16(c0);
+        Color16 color1 = vector3_to_color16(c1);
 
-    return evaluate_mse(input_colors, input_weights, color_weights, output);
-}
+        if (color0.u < color1.u) {
+            swap(color0, color1);
+        }
 
+        Color32 palette[4];
+        evaluate_palette(color0, color1, palette);
 
+        block->col0 = color0;
+        block->col1 = color1;
+        block->indices = bc1_find_sels(input_colors, color0.r, color0.g, color0.b, color1.r, color1.g, color1.b);
+    }
 
+    /*// Quick end point selection.
+    Vector3 c0, c1;
+    //fit_colors_bbox(colors, count, &c0, &c1);
+    //select_diagonal(colors, count, &c0, &c1);
+    fit_colors_bbox(vec_colors, 16, &c0, &c1);
+    if (c0 == c1) {
+        ::compress_dxt1_single_color_optimal(vector3_to_color(c0), output);
+        return;
+    }
+    inset_bbox(&c0, &c1);
+    select_diagonal(vec_colors, 16, &c0, &c1);
+    output_block4(vec_colors, c0, c1, output);
+
+    // Refine color for the selected indices.
+    if (optimize_end_points4(output->indices, vec_colors, 16, &c0, &c1)) {
+        output_block4(vec_colors, c0, c1, output);
+    }*/
+}
diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h
index ac5bdb5..97aa5a5 100644
--- a/src/nvtt/CompressorDXT1.h
+++ b/src/nvtt/CompressorDXT1.h
@@ -18,9 +18,15 @@ namespace nv {
     void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
 
     // Cluster fit end point selection.
-    float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output);
+    float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool hq, BlockDXT1 * output);
 
     // Quick end point selection followed by least squares refinement.
     float compress_dxt1_fast(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output);
 
+    // @@ Change these interfaces to take a pitch argument instead of assuming (4*4), just like CMP_Core.
+    void compress_dxt1_fast2(const unsigned char input_colors[16*4], BlockDXT1 * output);
+    void compress_dxt1_fast_geld(const unsigned char input_colors[16 * 4], BlockDXT1 * output);
+
+    float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const BlockDXT1 * block, int decoder = 0);
+
 }
diff --git a/src/nvtt/CompressorDXT5_RGBM.cpp b/src/nvtt/CompressorDXT5_RGBM.cpp
index 3274f2c..0002470 100644
--- a/src/nvtt/CompressorDXT5_RGBM.cpp
+++ b/src/nvtt/CompressorDXT5_RGBM.cpp
@@ -59,7 +59,7 @@ float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_w
     convert_to_rgbm(input_colors, input_weights, min_m, input_colors_rgbm, rgb_weights);
 
     // Compress RGB.
-    compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, &output->color);
+    compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, /*hq=*/false, &output->color);
 
     // Decompress RGB/M block.
     nv::ColorBlock RGB;