diff --git a/src/nvtt/CompressorDXT1.cpp b/src/nvtt/CompressorDXT1.cpp
index 128bc4a..f68ab3c 100644
--- a/src/nvtt/CompressorDXT1.cpp
+++ b/src/nvtt/CompressorDXT1.cpp
@@ -1048,77 +1048,79 @@ float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weight
         compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output);
 
         float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output);
-        
+
         if (cluster_fit_error < error) {
             *output = cluster_fit_output;
             error = cluster_fit_error;
+        }
 
-            /*if (hq && cluster_fit_output.isFourColorMode()) {
-                
-                // Refine color for the selected indices.
-                Vector3 c0, c1;
-                if (optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
-                    BlockDXT1 box_fit_output;
-                    output_block4(input_colors, color_weights, c0, c1, &box_fit_output);
+        if (hq) {  // Greedy endpoint refinement: nudge one endpoint at a time and keep any step that lowers the error.
+            // TODO:
+            // - Optimize palette evaluation when updating only one channel.
+            // - Try all diagonals (two of them are still commented out in the table below).
 
-                    float box_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &box_fit_output);
-                    if (box_fit_error < error) {
-                        error = box_fit_error;
-                        *output = box_fit_output;
-                    }
-                }
-            }*/
+            // Things that were tried and don't help:
+            // - Alternate endpoint updates.
+            // - Randomize order.
+            // - If one direction does not improve, test opposite direction next.
 
-            if (hq) {
-                int8 deltas[16][3] = {
-                    {1,0,0},
-                    {0,1,0},
-                    {0,0,1},
+            static const int8 deltas[16][3] = {  // Per-channel (r,g,b) steps applied to one endpoint.
+                {1,0,0},
+                {0,1,0},
+                {0,0,1},
 
-                    {-1,0,0},
-                    {0,-1,0},
-                    {0,0,-1},
+                {-1,0,0},
+                {0,-1,0},
+                {0,0,-1},
 
-                    {1,1,0},
-                    {1,0,1},
-                    {0,1,1},
+                {1,1,0},
+                {1,0,1},
+                {0,1,1},
 
-                    {-1,-1,0},
-                    {-1,0,-1},
-                    {0,-1,-1},
+                {-1,-1,0},
+                {-1,0,-1},
+                {0,-1,-1},
 
-                    {-1,1,0},
-                    //{-1,0,1},
+                {-1,1,0},
+                //{-1,0,1},
 
-                    {1,-1,0},
-                    {0,-1,1},
+                {1,-1,0},
+                {0,-1,1},
 
-                    //{1,0,-1},
-                    {0,1,-1},
-                };
+                //{1,0,-1},
+                {0,1,-1},
+            };
 
+            int lastImprovement = 0;  // Step index of the most recent accepted improvement.
+            for (int i = 0; i < 256; i++) {  // Each step restarts from the best block found so far.
                 BlockDXT1 refined = *output;
-                for (int i = 0; i < 10000; i++) {
-                    int rnd = i * 2654435761;
-                    int8 delta[3] = { deltas[rnd % 16][0], deltas[rnd % 16][1], deltas[rnd % 16][2] };
+                int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] };
 
-                    if ((rnd / 16) & 1) {
-                        refined.col0.r += delta[0];
-                        refined.col0.g += delta[1];
-                        refined.col0.b += delta[2];
-                    }
-                    else {
-                        refined.col1.r += delta[0];
-                        refined.col1.g += delta[1];
-                        refined.col1.b += delta[2];
-                    }
-
-                    float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
-                    if (refined_error < error) {
-                        *output = refined;
-                        error = refined_error;
-                    }
+                if ((i / 16) & 1) {  // Switch endpoint every 16 steps: col1 for steps 0-15, col0 for 16-31, and so on.
+                    refined.col0.r += delta[0];
+                    refined.col0.g += delta[1];
+                    refined.col0.b += delta[2];
                 }
+                else {
+                    refined.col1.r += delta[0];
+                    refined.col1.g += delta[1];
+                    refined.col1.b += delta[2];
+                }
+
+                Vector3 palette[4];  // Must come from the candidate endpoints, not from *output, or the indices won't match them.
+                evaluate_palette(refined.col0, refined.col1, palette);
+
+                refined.indices = compute_indices(input_colors, color_weights, palette);
+
+                float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
+                if (refined_error < error) {
+                    *output = refined;
+                    error = refined_error;
+                    lastImprovement = i;
+                }
+
+                // Early out once more than 32 consecutive steps have failed to improve the error.
+                if (i - lastImprovement > 32) break;
             }
         }
     }