diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index 4f5602a..7b91e2b 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -411,19 +411,19 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
 #else
 
 inline Vector3 round565(const Vector3 & v) {
-	uint r = ftoi_floor(v.x * 31.0f);
+	uint r = ftoi_trunc(v.x * 31.0f);
     float r0 = float(((r+0) << 3) | ((r+0) >> 2));
     float r1 = float(((r+1) << 3) | ((r+1) >> 2));
     if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U);
 	r = (r << 3) | (r >> 2);
 
-	uint g = ftoi_floor(v.y * 63.0f);
+	uint g = ftoi_trunc(v.y * 63.0f);
     float g0 = float(((g+0) << 2) | ((g+0) >> 4));
     float g1 = float(((g+1) << 2) | ((g+1) >> 4));
     if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U);
     g = (g << 2) | (g >> 4);
 
-    uint b = ftoi_floor(v.z * 31.0f);
+    uint b = ftoi_trunc(v.z * 31.0f);
     float b0 = float(((b+0) << 3) | ((b+0) >> 2));
     float b1 = float(((b+1) << 3) | ((b+1) >> 2));
     if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U);
@@ -474,8 +474,10 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
             // clamp to the grid
             a = clamp(a, 0, 1);
             b = clamp(b, 0, 1);
-            //a = floor(grid * a + 0.5f) * gridrcp;
-            //b = floor(grid * b + 0.5f) * gridrcp;
+#if 1
+            a = floor(grid * a + 0.5f) * gridrcp;
+            b = floor(grid * b + 0.5f) * gridrcp;
+#else
 
             //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f;
             //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f;
@@ -496,7 +498,7 @@ bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
 
             a = round565(a);
             b = round565(b);
-
+#endif
 
             // compute the error
             Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
@@ -582,9 +584,10 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
                 // clamp to the grid
                 a = clamp(a, 0, 1);
                 b = clamp(b, 0, 1);
-                //a = floor(a * grid + 0.5f) * gridrcp;
-                //b = floor(b * grid + 0.5f) * gridrcp;
-
+#if 0
+                a = floor(a * grid + 0.5f) * gridrcp;
+                b = floor(b * grid + 0.5f) * gridrcp;
+#else
                 //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f;
                 //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f;
                 //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f;
@@ -606,6 +609,8 @@ bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
 
                 a = round565(a);
                 b = round565(b);
+#endif
+                // @@ It would be much more accurate to evaluate the error exactly. 
 
                 // compute the error
                 Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
diff --git a/src/nvtt/ClusterFit.h b/src/nvtt/ClusterFit.h
index 39dcbfb..72c9ef9 100644
--- a/src/nvtt/ClusterFit.h
+++ b/src/nvtt/ClusterFit.h
@@ -31,8 +31,8 @@
 #include "nvmath/Vector.h"
 
 // Use SIMD version if altivec or SSE are available.
-//#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE)
-#define NVTT_USE_SIMD 0
+#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE)
+//#define NVTT_USE_SIMD 0
 
 namespace nv {
 
diff --git a/src/nvtt/CompressorDX9.cpp b/src/nvtt/CompressorDX9.cpp
index 1fc3254..aaef88d 100644
--- a/src/nvtt/CompressorDX9.cpp
+++ b/src/nvtt/CompressorDX9.cpp
@@ -113,9 +113,40 @@ void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alpha
 }
 
 
+namespace nv {
+    float compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output);
+}
+
 #if 1
 void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output)
 {
+#if 1
+    // @@ This setup is the same for all compressors.
+    Vector3 input_colors[16];
+    float input_weights[16];
+
+    uint x, y;
+    for (y = 0; y < set.h; y++) {
+        for (x = 0; x < set.w; x++) {
+            input_colors[4*y+x] = set.color(x, y).xyz();
+            input_weights[4*y+x] = 1.0f;
+            if (alphaMode == nvtt::AlphaMode_Transparency) input_weights[4*y+x] = set.color(x, y).w;   // Weight by alpha (w), not blue (z).
+        }
+        for (; x < 4; x++) {   // Pad columns missing from a partial block with zero-weight texels.
+            input_colors[4*y+x] = Vector3(0);
+            input_weights[4*y+x] = 0.0f;
+        }
+    }
+    for (; y < 4; y++) {
+        for (x = 0; x < 4; x++) {
+            input_colors[4*y+x] = Vector3(0);
+            input_weights[4*y+x] = 0.0f;
+        }
+    }
+
+    compress_dxt1(input_colors, input_weights, compressionOptions.colorWeight.xyz(), (BlockDXT1 *)output);
+
+#else
     set.setUniformWeights();
     set.createMinimalSet(/*ignoreTransparent*/false);
 
@@ -145,8 +176,9 @@ void CompressorDXT1::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, co
             QuickCompress::outputBlock4(set, start, end, block);        
         }
     }
+#endif
 }
-#elif 1
+#elif 0
 
 
 extern void compress_dxt1_bounding_box_exhaustive(const ColorBlock & input, BlockDXT1 * output);
diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
index f884a9a..62653f6 100644
--- a/src/nvtt/CompressorDXT1.cpp
+++ b/src/nvtt/CompressorDXT1.cpp
@@ -20,13 +20,81 @@
 using namespace nv;
 
 
-inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16])
-{
-	for (int i = 0; i < 16; i++)
-	{
-		const Color32 c = rgba.color(i);
-		block[i] = Vector3(c.r, c.g, c.b);
-	}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Color conversion functions.
+
+static const float midpoints5[32] = {
+    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
+    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
+};
+
+static const float midpoints6[64] = {
+    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 
+    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 
+    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 
+    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
+};
+
+/*void init_tables() {
+    for (int i = 0; i < 31; i++) {
+        float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
+        float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
+        midpoints5[i] = (f0 + f1) * 0.5;
+    }
+    midpoints5[31] = 1.0f;
+
+    for (int i = 0; i < 63; i++) {
+        float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
+        float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
+        midpoints6[i] = (f0 + f1) * 0.5;
+    }
+    midpoints6[63] = 1.0f;
+}*/
+
+// Quantizes a [0,1] color to 565, rounding to the nearest level as measured after 8-bit expansion.
+static Color16 vector3_to_color16(const Vector3 & v) {
+    // Truncate to the lower quantization level.
+    uint r = ftoi_trunc(clamp(v.x * 31.0f, 0.0f, 31.0f));
+    uint g = ftoi_trunc(clamp(v.y * 63.0f, 0.0f, 63.0f));
+    uint b = ftoi_trunc(clamp(v.z * 31.0f, 0.0f, 31.0f));
+
+    // Step up when v is past the midpoint to the next bit-expanded level. Never step past the top
+    // level: a component above 1.0 would otherwise give 32/64 and spill into the adjacent field.
+    r += (r < 31 && v.x > midpoints5[r]);
+    g += (g < 63 && v.y > midpoints6[g]);
+    b += (b < 31 && v.z > midpoints5[b]);
+    return Color16((r << 11) | (g << 5) | b);
+}
+
+
+static Color32 bitexpand_color16_to_color32(Color16 c16) {
+    Color32 c32;
+    //c32.b = (c16.b << 3) | (c16.b >> 2);
+    //c32.g = (c16.g << 2) | (c16.g >> 4);
+    //c32.r = (c16.r << 3) | (c16.r >> 2);
+    //c32.a = 0xFF;
+    // Packed form of the per-channel expansion above, operating on all channels of the raw words at once.
+    c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);   // Move the 5/6/5 fields to the top of bytes 0/1/2.
+    c32.u |= (c32.u >> 5) & 0x070007;   // Replicate the top 3 bits of the two 5-bit channels into their low bits.
+    c32.u |= (c32.u >> 6) & 0x000300;   // Replicate the top 2 bits of the 6-bit channel into its low bits.
+    // NOTE(review): bits 24-31 of c32.u stay zero (presumably alpha), unlike the 0xFF above - confirm callers ignore alpha.
+    return c32;
+}
+
+static Color32 bitexpand_color16_to_color32(int r, int g, int b) {
+    Color32 c32;
+    c32.b = (b << 3) | (b >> 2);
+    c32.g = (g << 2) | (g >> 4);
+    c32.r = (r << 3) | (r >> 2);
+    c32.a = 0xFF;
+    return c32;
+}
+
+static Color16 truncate_color32_to_color16(Color32 c32) {
+    Color16 c16;
+    c16.b = (c32.b >> 3);
+    c16.g = (c32.g >> 2);
+    c16.r = (c32.r >> 3);
+    return c16;
 }
 
 inline Vector3 r5g6b5_to_vector3(int r, int g, int b)
@@ -40,20 +108,32 @@ inline Vector3 r5g6b5_to_vector3(int r, int g, int b)
 
 inline Vector3 color_to_vector3(Color32 c)
 {
-    const float scale = 1.0f / 255.0f;
+    const float scale = 1.0f / 255.0f;
     return Vector3(c.r * scale, c.g * scale, c.b * scale);
 }
 
 inline Color32 vector3_to_color(Vector3 v)
 {
-    Color32 color;
-    color.r = U8(ftoi_round(saturate(v.x) * 255));
-    color.g = U8(ftoi_round(saturate(v.y) * 255));
-    color.b = U8(ftoi_round(saturate(v.z) * 255));
-    color.a = 255;
+    Color32 color;
+    color.r = U8(ftoi_round(saturate(v.x) * 255));
+    color.g = U8(ftoi_round(saturate(v.y) * 255));
+    color.b = U8(ftoi_round(saturate(v.z) * 255));
+    color.a = 255;
+    return color;
 }
 
 
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Input block processing.
+
+inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16])
+{
+	for (int i = 0; i < 16; i++)
+	{
+		const Color32 c = rgba.color(i);
+		block[i] = Vector3(c.r, c.g, c.b);
+	}
+}
 
 // Find first valid color.
 static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color)
@@ -114,46 +194,67 @@ static int reduce_colors(const Vector3 * input_colors, const float * input_weigh
 }
 
 
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Error evaluation.
 
 // Different ways of estimating the error.
-static float evaluate_mse(const Vector3 & p, const Vector3 & c) {
-    return square(p.x-c.x) + square(p.y-c.y) + square(p.z-c.z);
+/*static float evaluate_mse(const Vector3 & p, const Vector3 & c) {
+    //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z);
+    Vector3 d = (p - c);
+    return dot(d, d);
+}*/
+
+static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
+    //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z);
+    Vector3 d = (p - c) * w;
+    return dot(d, d);
 }
 
 /*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
     return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z);
 }*/
 
-static int evaluate_mse_rgb(const Color32 & p, const Color32 & c) {
-    return square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b);
+static int evaluate_mse(const Color32 & p, const Color32 & c) {
+    return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
 }
 
-static float evaluate_mse(const Vector3 palette[4], const Vector3 & c) {
-    float e0 = evaluate_mse(palette[0], c);
-    float e1 = evaluate_mse(palette[1], c);
-    float e2 = evaluate_mse(palette[2], c);
-    float e3 = evaluate_mse(palette[3], c);
+static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
+    float e0 = evaluate_mse(palette[0], c, w);
+    float e1 = evaluate_mse(palette[1], c, w);
+    float e2 = evaluate_mse(palette[2], c, w);
+    float e3 = evaluate_mse(palette[3], c, w);
     return min(min(e0, e1), min(e2, e3));
 }
 
 static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
-    int e0 = evaluate_mse_rgb(palette[0], c);
-    int e1 = evaluate_mse_rgb(palette[1], c);
-    int e2 = evaluate_mse_rgb(palette[2], c);
-    int e3 = evaluate_mse_rgb(palette[3], c);
+    int e0 = evaluate_mse(palette[0], c);
+    int e1 = evaluate_mse(palette[1], c);
+    int e2 = evaluate_mse(palette[2], c);
+    int e3 = evaluate_mse(palette[3], c);
     return min(min(e0, e1), min(e2, e3));
 }
 
-static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, int index) {
-    return evaluate_mse(palette[index], c);
-}
+// Returns MSE error in [0-255] range.
+static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
+    Color32 palette[4];
+    output->evaluatePalette(palette, /*d3d9=*/false);
 
-static int evaluate_mse(const Color32 palette[4], const Color32 & c, int index) {
-    return evaluate_mse_rgb(palette[index], c);
+    return evaluate_mse(palette[index], color);
 }
 
+// Returns weighted MSE error in [0-255] range.
+static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
+    
+	float total = 0.0f;
+	for (int i = 0; i < count; i++) {
+        total += weights[i] * evaluate_mse(palette, colors[i]);
+	}
+
+	return total;
+}
 
-static float evaluate_mse(const BlockDXT1 * output, Vector3 colors[16]) {
+#if 0
+static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) {
     Color32 palette[4];
     output->evaluatePalette(palette, /*d3d9=*/false);
 
@@ -167,39 +268,171 @@ static float evaluate_mse(const BlockDXT1 * output, Vector3 colors[16]) {
     float error = 0.0f;
     for (int i = 0; i < 16; i++) {
         int index = (output->indices >> (2*i)) & 3; // @@ Is this the right order?
-        error += evaluate_mse(vector_palette, colors[i], index);
+        error += evaluate_mse(vector_palette[index], colors[i]);
     }
 
     return error;
 }
+#endif
 
-static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
+static float evaluate_mse(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
     Color32 palette[4];
     output->evaluatePalette(palette, /*d3d9=*/false);
 
-    return evaluate_mse(palette, color, index);
+    // convert palette to float.
+    Vector3 vector_palette[4];
+    for (int i = 0; i < 4; i++) {
+        vector_palette[i] = color_to_vector3(palette[i]);
+    }
+
+    // evaluate error for each index.
+    float error = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        int index = (output->indices >> (2 * i)) & 3;
+        error += weights[i] * evaluate_mse(vector_palette[index], colors[i], color_weights);
+    }
+    return error;
 }
 
 
-/*void output_block3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Palette evaluation.
+
+static void evaluate_palette4(Color32 palette[4]) {
+    palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
+    palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
+    palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
+    palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
+    palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
+    palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
+}
+
+static void evaluate_palette3(Color32 palette[4]) {
+    palette[2].r = (palette[0].r + palette[1].r) / 2;
+    palette[2].g = (palette[0].g + palette[1].g) / 2;
+    palette[2].b = (palette[0].b + palette[1].b) / 2;
+    palette[3].r = 0;
+    palette[3].g = 0;
+    palette[3].b = 0;
+}
+
+static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) {
+    palette[0] = bitexpand_color16_to_color32(c0);
+    palette[1] = bitexpand_color16_to_color32(c1);
+    if (c0.u > c1.u) {
+        evaluate_palette4(palette);
+    }
+    else {
+        evaluate_palette3(palette);
+    }
+}
+
+static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
+    Color32 palette32[4];
+    evaluate_palette(c0, c1, palette32);
+
+    for (int i = 0; i < 4; i++) {
+        palette[i] = color_to_vector3(palette32[i]);
+    }
+}
+
+// Float palette for an endpoint pair that is expected to select three-color mode.
+static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) {
+    // evaluate_palette() builds the three-color palette only when c0.u <= c1.u.
+    nvDebugCheck(c0.u <= c1.u);
+    Color32 palette32[4];
+    evaluate_palette(c0, c1, palette32);
+    for (int i = 0; i < 4; i++) {
+        palette[i] = color_to_vector3(palette32[i]);
+    }
+}
+
+
+
+
+
+static uint compute_indices4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+    // Packs one 2-bit palette index per texel (texel i at bits 2i..2i+1), chosen by weighted squared distance.
+    uint indices = 0;
+	for (int i = 0; i < 16; i++) {
+		float d0 = evaluate_mse(palette[0], input_colors[i], color_weights);
+		float d1 = evaluate_mse(palette[1], input_colors[i], color_weights);
+		float d2 = evaluate_mse(palette[2], input_colors[i], color_weights);
+		float d3 = evaluate_mse(palette[3], input_colors[i], color_weights);
+		// Branchless selection; relies on palette[2] and palette[3] lying between palette[0] and palette[1].
+		uint b0 = d0 > d3;
+		uint b1 = d1 > d2;
+		uint b2 = d0 > d2;
+		uint b3 = d1 > d3;
+		uint b4 = d2 > d3;
+		// Not a general arg-min: for example d1 < d0 < d2 < d3 produces index 0.
+		uint x0 = b1 & b2;
+		uint x1 = b0 & b3;
+		uint x2 = b0 & b4;
+		// Low bit of the index is x2, high bit is (x0 | x1).
+		indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
+	}
+
+	return indices;
+}
+
+
+static uint compute_indices(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
+    // Packs one 2-bit palette index per texel (texel i at bits 2i..2i+1), picking the nearest of the four entries.
+    uint indices = 0;
+	for (int i = 0; i < 16; i++) {
+		float d0 = evaluate_mse(palette[0], input_colors[i], color_weights);
+		float d1 = evaluate_mse(palette[1], input_colors[i], color_weights);
+		float d2 = evaluate_mse(palette[2], input_colors[i], color_weights);
+		float d3 = evaluate_mse(palette[3], input_colors[i], color_weights);
+		// Explicit arg-min; the comparisons are strict, so equal distances resolve to the higher index.
+        uint index;
+        if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
+        else if (d1 < d2 && d1 < d3) index = 1;
+        else if (d2 < d3) index = 2;
+        else index = 3;
+
+		indices |= index << (2 * i);
+	}
+
+	return indices;
+}
+
+
+static void output_block3(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
 {
-    Vector3 minColor = start * 255.0f;
-    Vector3 maxColor = end * 255.0f;
-    uint16 color0 = roundAndExpand(&minColor);
-    uint16 color1 = roundAndExpand(&maxColor);
+    Color16 color0 = vector3_to_color16(v0);
+    Color16 color1 = vector3_to_color16(v1);
 
-    if (color0 > color1) {
-        swap(maxColor, minColor);
+    if (color0.u > color1.u) {
         swap(color0, color1);
     }
 
-    block->col0 = Color16(color0);
-    block->col1 = Color16(color1);
-    block->indices = compute_indices3(colors, weights, count, maxColor / 255.0f, minColor / 255.0f);
+    Vector3 palette[4];
+    evaluate_palette(color0, color1, palette);
 
-    //optimizeEndPoints3(set, block);
-}*/
+    block->col0 = color0;
+    block->col1 = color1;
+    block->indices = compute_indices(input_colors, color_weights, palette);
+}
+
+static void output_block4(const Vector3 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+{
+    Color16 color0 = vector3_to_color16(v0);
+    Color16 color1 = vector3_to_color16(v1);
+
+    if (color0.u < color1.u) {
+        swap(color0, color1);
+    }
 
+    Vector3 palette[4];
+    evaluate_palette(color0, color1, palette);
+
+    block->col0 = color0;
+    block->col1 = color1;
+    block->indices = compute_indices4(input_colors, color_weights, palette);
+}
 
 
 
@@ -207,7 +440,7 @@ static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
 
 // Single color compressor, based on:
 // https://mollyrocket.com/forums/viewtopic.php?t=392
-float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
+static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
 {
     output->col0.r = OMatch5[c.r][0];
     output->col0.g = OMatch6[c.g][0];
@@ -222,92 +455,66 @@ float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
         swap(output->col0.u, output->col1.u);
         output->indices ^= 0x55555555;
     }
-
-    return (float) evaluate_mse(output, c, output->indices & 3);
 }
 
 
-float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output)
-{
-    return compress_dxt1_single_color_optimal(vector3_to_color(color), output);
-}
-
-
-// Low quality baseline compressor.
-float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output)
+float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
 {
-    // @@ Iterative best end point fit.
+    ::compress_dxt1_single_color_optimal(c, output);
 
-    return FLT_MAX;
+    // Multiply by 16^2, the weight associated to a single color.
+    // Divide by 255*255 to convert the error to [0-1] range.
+    return (256.0f / (255*255)) * evaluate_mse(output, c, output->indices & 3);
 }
 
 
-static Color32 bitexpand_color16_to_color32(Color16 c16) {
-    Color32 c32;
-    c32.b = (c16.b << 3) | (c16.b >> 2);
-    c32.g = (c16.g << 2) | (c16.g >> 4);
-    c32.r = (c16.r << 3) | (c16.r >> 2);
-    c32.a = 0xFF;
-
-    //c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
-    //c32.u |= (c32.u >> 5) & 0x070007;
-    //c32.u |= (c32.u >> 6) & 0x000300;
-
-    return c32;
+float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output)
+{
+    return compress_dxt1_single_color_optimal(vector3_to_color(color), output);
 }
 
-static Color32 bitexpand_color16_to_color32(int r, int g, int b) {
-    Color32 c32;
-    c32.b = (b << 3) | (b >> 2);
-    c32.g = (g << 2) | (g >> 4);
-    c32.r = (r << 3) | (r >> 2);
-    c32.a = 0xFF;
-    return c32;
-}
 
-static Color16 truncate_color32_to_color16(Color32 c32) {
-    Color16 c16;
-    c16.b = (c32.b >> 3);
-    c16.g = (c32.g >> 2);
-    c16.r = (c32.r >> 3);
-    return c16;
-}
+// Compress block using the average color.
+float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output)
+{
+    // Compute block average.
+    Vector3 color_sum(0);
+    float weight_sum = 0;
 
+    for (int i = 0; i < count; i++) {
+        color_sum += colors[i] * weights[i];
+        weight_sum += weights[i];
+    }
 
+    // Compress optimally.
+    ::compress_dxt1_single_color_optimal(vector3_to_color(color_sum / weight_sum), output);
 
+    // Decompress block color.
+    Color32 palette[4];
+    output->evaluatePalette(palette, /*d3d9=*/false);
 
-static float evaluate_palette4(Color32 palette[4]) {
-    palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
-    palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
-    palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
-    palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
-    palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
-    palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
-}
+    Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]);
 
-static float evaluate_palette3(Color32 palette[4]) {
-    palette[2].r = (palette[0].r + palette[1].r) / 2;
-    palette[2].g = (palette[0].g + palette[1].g) / 2;
-    palette[2].b = (palette[0].b + palette[1].b) / 2;
-    palette[3].r = 0;
-    palette[3].g = 0;
-    palette[3].b = 0;
+    // Evaluate error.
+    float error = 0;
+    for (int i = 0; i < count; i++) {
+        error += weights[i] * evaluate_mse(block_color, colors[i], color_weights);
+    }
+    return error;
 }
 
-static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
-    
-	float total = 0.0f;
-	for (int i = 0; i < count; i++) {
-        total += (weights[i] * weights[i]) * evaluate_mse(palette, colors[i]);
-	}
-
-	return total;
-}
 
+/* @@ Not implemented yet.
+// Low quality baseline compressor.
+float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output)
+{
+    // @@ Iterative best end point fit.
 
+    return FLT_MAX;
+}*/
 
 
-float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int max_volume, BlockDXT1 * output)
+float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int max_volume, BlockDXT1 * output)
 {
     // Compute bounding box.
     Vector3 min_color(1.0f);
@@ -331,85 +538,92 @@ float nv::compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16],
     int range_g = max_g - min_g;
     int range_b = max_b - min_b;
 
-    min_r = max(0, min_r - (range_r + 1) / 1 - 1);
-    min_g = max(0, min_g - (range_g + 1) / 1 - 1);
-    min_b = max(0, min_b - (range_b + 1) / 1 - 1);
+    min_r = max(0, min_r - range_r / 2 - 2);
+    min_g = max(0, min_g - range_g / 2 - 2);
+    min_b = max(0, min_b - range_b / 2 - 2);
 
-    max_r = min(31, max_r + (range_r + 1) / 2 + 1);
-    max_g = min(63, max_g + (range_g + 1) / 2 + 1);
-    max_b = min(31, max_b + (range_b + 1) / 2 + 1);
+    max_r = min(31, max_r + range_r / 2 + 2);
+    max_g = min(63, max_g + range_g / 2 + 2);
+    max_b = min(31, max_b + range_b / 2 + 2);
 
     // Estimate size of search space.
     int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1);
 
-    // if size under search_limit, then proceed. Note that search_limit is sqrt of number of evaluations.
+    // If the search volume is within max_volume, proceed. Note that the volume is the sqrt of the number of evaluations.
     if (volume > max_volume) {
         return FLT_MAX;
     }
 
+    // @@ Convert to fixed point before building box?
     Color32 colors32[16];
     for (int i = 0; i < count; i++) {
         colors32[i] = toColor32(Vector4(colors[i], 1));
     }
 
     float best_error = FLT_MAX;
-    Color32 best0, best1;
+    Color16 best0, best1;           // @@ Record endpoints as Color16?
+
+    Color16 c0, c1;
+    Color32 palette[4];
 
     for(int r0 = min_r; r0 <= max_r; r0++)
-    for(int r1 = max_r; r1 >= r0; r1--)
     for(int g0 = min_g; g0 <= max_g; g0++)
-    for(int g1 = max_g; g1 >= g0; g1--)
     for(int b0 = min_b; b0 <= max_b; b0++)
-    for(int b1 = max_b; b1 >= b0; b1--)
     {
-        Color32 palette[4];
-        palette[0] = bitexpand_color16_to_color32(r1, g1, b1);
-        palette[1] = bitexpand_color16_to_color32(r0, g0, b0);
-        
-        // Evaluate error in 4 color mode.
-        evaluate_palette4(palette);
+        c0.r = r0; c0.g = g0; c0.b = b0;
+        palette[0] = bitexpand_color16_to_color32(c0);
+
+        for(int r1 = min_r; r1 <= max_r; r1++)
+        for(int g1 = min_g; g1 <= max_g; g1++)
+        for(int b1 = min_b; b1 <= max_b; b1++)
+        {
+            c1.r = r1; c1.g = g1; c1.b = b1;
+            palette[1] = bitexpand_color16_to_color32(c1);
+
+            if (c0.u > c1.u) {
+                // Evaluate error in 4 color mode.
+                evaluate_palette4(palette);
+            }
+            else {
+    #if 1
+                // Evaluate error in 3 color mode.
+                evaluate_palette3(palette);
+    #else
+                // Skip 3 color mode.
+                continue;
+    #endif
+            }
 
-        float error = evaluate_palette_error(palette, colors32, weights, count);
+            float error = evaluate_palette_error(palette, colors32, weights, count);
 
-        if (error < best_error) {
-            best_error = error;
-            best0 = palette[0];
-            best1 = palette[1];
+            if (error < best_error) {
+                best_error = error;
+                best0 = c0;
+                best1 = c1;
+            }
         }
+    }
 
-#if 0
-        // Evaluate error in 3 color mode.
-        evaluate_palette3(palette);
-
-        float error = evaluate_palette_error(palette, colors, weights, count);
+    output->col0 = best0;
+    output->col1 = best1;
 
-        if (error < best_error) {
-            best_error = error;
-            best0 = palette[1];
-            best1 = palette[0];
-        }
-#endif
+    if (output->col0.u < output->col1.u) {
+        int k = 1;
     }
 
-    output->col0 = truncate_color32_to_color16(best0);
-    output->col1 = truncate_color32_to_color16(best1);
+    Vector3 vector_palette[4];
+    evaluate_palette(output->col0, output->col1, vector_palette);
 
-    if (output->col0.u <= output->col1.u) {
-        //output->indices = computeIndices3(colors, best0, best1);
-    }
-    else {
-        //output->indices = computeIndices4(colors, best0, best1);
-    }
+    output->indices = compute_indices(input_colors, color_weights, vector_palette);
 
-    return FLT_MAX;
+    return best_error / (255 * 255);
 }
 
 
-float nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output)
+void nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output)
 {
     ClusterFit fit;
-    //fit.setColorWeights(compressionOptions.colorWeight);
-    fit.setColorWeights(Vector4(1));                // @@ Set color weights.
+    fit.setColorWeights(Vector4(color_weights, 1));
     fit.setColorSet(colors, weights, count);
 
     // start & end are in [0, 1] range.
@@ -417,18 +631,17 @@ float nv::compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector
     fit.compress4(&start, &end);
 
     if (fit.compress3(&start, &end)) {
-        //output_block3(input_colors, start, end, block);
-        // @@ Output block.
+        output_block3(input_colors, color_weights, start, end, output);
     }
     else {
-        //output_block4(input_colors, start, end, block);
-        // @@ Output block. 
+        output_block4(input_colors, color_weights, start, end, output);
     }
 }
 
 
 
-float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], BlockDXT1 * output)
+
+float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weights[16], const Vector3 & color_weights, BlockDXT1 * output)
 {
     Vector3 colors[16];
     float weights[16];
@@ -442,20 +655,59 @@ float nv::compress_dxt1(const Vector3 input_colors[16], const float input_weight
         return 0;
     }
 
-    if (count == 1) {
-        return compress_dxt1_single_color_optimal(colors[0], output);
+
+    float error = FLT_MAX;
+
+    // Sometimes the single color compressor produces better results than the exhaustive. This introduces discontinuities between blocks that
+    // use different compressors. For this reason, this is not enabled by default.
+    if (1) {
+        error = compress_dxt1_single_color(colors, weights, count, color_weights, output);
+
+        if (error == 0.0f || count == 1) {
+            // Early out.
+            return error;
+        }
     }
 
+    // This is too expensive, even with a low threshold.
     // If high quality:
-    //error = compress_dxt1_bounding_box_exhaustive(colors, weigths, count, 3200, error, output);
-    //if (error < FLT_MAX) return error;
+    if (0) {
+        BlockDXT1 exhaustive_output;
+        float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, 400, &exhaustive_output);
+
+        if (exhaustive_error != FLT_MAX) {
+            float exhaustive_error2 = evaluate_mse(input_colors, input_weights, color_weights, &exhaustive_output);
+
+            // The exhaustive compressor does not use color_weights, so the results may be different.
+            //nvCheck(equal(exhaustive_error, exhaustive_error2));
+
+            if (exhaustive_error2 < error) {
+                *output = exhaustive_output;
+                error = exhaustive_error2;   // Store the metric that was compared, so later candidates are judged consistently.
+            }
+        }
+    }
 
+    // @@ TODO.
     // This is pretty fast and in some cases can produces better quality than cluster fit.
-//    error = compress_dxt1_least_squares_fit(colors, weigths, error, output);
+    //error = compress_dxt1_least_squares_fit(colors, weigths, error, output);
 
-    // 
-    float error = compress_dxt1_cluster_fit(input_colors, colors, weights, count, output);
+    // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already.
+    if (error == FLT_MAX && count == 1) {
+        error = compress_dxt1_single_color_optimal(colors[0], output);
+    }
+
+    if (count > 1) {
+        BlockDXT1 cluster_fit_output;
+        compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, &cluster_fit_output);
+
+        float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output);
+        
+        if (cluster_fit_error < error) {
+            *output = cluster_fit_output;
+            error = cluster_fit_error;
+        }
+    }
 
     return error;
 }
-
diff --git a/src/nvtt/CompressorDXT1.h b/src/nvtt/CompressorDXT1.h
index 3c2fe57..fb9d655 100644
--- a/src/nvtt/CompressorDXT1.h
+++ b/src/nvtt/CompressorDXT1.h
@@ -28,11 +28,12 @@ namespace nv {
     float compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output);
     float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output);
 
-    float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output);
-    float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, int search_limit, BlockDXT1 * output);
-    float compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, BlockDXT1 * output);
+    float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
+    float compress_dxt1_least_squares_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
+    float compress_dxt1_bounding_box_exhaustive(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, int search_limit, BlockDXT1 * output);
+    void compress_dxt1_cluster_fit(const Vector3 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output);
 
 
-    float compress_dxt1(const Vector3 colors[16], const float weights[16], BlockDXT1 * output);
+    float compress_dxt1(const Vector3 colors[16], const float weights[16], const Vector3 & color_weights, BlockDXT1 * output);
 
 }
\ No newline at end of file