From 831c8e6667812f749d90a2c5e472b48184b7cebc Mon Sep 17 00:00:00 2001
From: Ignacio Castano <castano@gmail.com>
Date: Sun, 23 Aug 2020 21:58:18 -0700
Subject: [PATCH] Upgrade ICBC to 1.05

---
 src/nvtt/icbc.h | 349 +++++++++++++++++-------------------------------
 1 file changed, 123 insertions(+), 226 deletions(-)

diff --git a/src/nvtt/icbc.h b/src/nvtt/icbc.h
index 82fd9fd..681a77a 100644
--- a/src/nvtt/icbc.h
+++ b/src/nvtt/icbc.h
@@ -1,4 +1,4 @@
-// icbc.h v1.04
+// icbc.h v1.05
 // A High Quality SIMD BC1 Encoder by Ignacio Castano <castano@gmail.com>.
 //
 // LICENSE:
@@ -376,6 +376,7 @@ ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); }
 ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); }
 ICBC_FORCEINLINE VFloat lane_id() { return 0; }
 ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; }
+ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; }
 ICBC_FORCEINLINE bool all(VMask m) { return m; }
 ICBC_FORCEINLINE bool any(VMask m) { return m; }
 ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; }
@@ -511,6 +512,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
 #endif
 }
 
+ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
+    return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
+}
+
 ICBC_FORCEINLINE bool all(VMask m) {
     int value = _mm_movemask_ps(m);
     return value == 0x7;
@@ -702,6 +707,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
     return _mm256_blendv_ps(a, b, mask);
 }
 
+ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
+    return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b)));
+}
+
 ICBC_FORCEINLINE bool all(VMask m) {
     __m256 zero = _mm256_setzero_ps();
     return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0;
@@ -715,6 +724,11 @@ ICBC_FORCEINLINE uint mask(VMask m) {
     return (uint)_mm256_movemask_ps(m);
 }
 
+// This is missing on some GCC versions.
+#if !defined _mm256_set_m128
+#define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
+#endif
+
 ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
 
     __m128 vlow  = _mm256_castps256_ps128(v);
@@ -917,6 +931,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
     return _mm512_mask_blend_ps(mask.m, a, b);
 }
 
+ICBC_FORCEINLINE VMask vbroadcast(bool b) { 
+    return { __mmask16(-int16_t(b)) };
+}
+
 ICBC_FORCEINLINE bool all(VMask mask) {
     return mask.m == 0xFFFFFFFF;
 }
@@ -1572,7 +1590,7 @@ static Color16 vector3_to_color16(const Vector3 & v) {
     b += (v.z > midpoints5[b]);
 
     Color16 c;
-    c.u = (r << 11) | (g << 5) | b;
+    c.u = uint16((r << 11) | (g << 5) | b);
     return c;
 }
 
@@ -1822,7 +1840,7 @@ int compute_sat(const Vector3 * colors, const float * weights, int count, Summed
     sat->w[0] = w;
 
     for (int i = 1; i < count; i++) {
-        float w = weights[order[i]];
+        w = weights[order[i]];
         sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w;
         sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w;
         sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w;
@@ -1906,9 +1924,9 @@ static void init_cluster_tables() {
                     }
 
                     if (!found) {
-                        s_fourCluster[i].c0 = c0;
-                        s_fourCluster[i].c1 = c0+c1;
-                        s_fourCluster[i].c2 = c0+c1+c2;
+                        s_fourCluster[i].c0 = uint8(c0);
+                        s_fourCluster[i].c1 = uint8(c0+c1);
+                        s_fourCluster[i].c2 = uint8(c0+c1+c2);
                         i++;
                     }
                 }
@@ -1941,8 +1959,8 @@ static void init_cluster_tables() {
                 }
 
                 if (!found) {
-                    s_threeCluster[i].c0 = c0;
-                    s_threeCluster[i].c1 = c0 + c1;
+                    s_threeCluster[i].c0 = uint8(c0);
+                    s_threeCluster[i].c1 = uint8(c0 + c1);
                     i++;
                 }
             }
@@ -2040,7 +2058,6 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me
         x1.z = vpermuteif(c1 >= 0, vbsat, c1);
         w1   = vpermuteif(c1 >= 0, vwsat, c1);
 
-
 #elif ICBC_USE_AVX2_PERMUTE2
 
         // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2.
@@ -2156,6 +2173,7 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me
         }
 
 #else
+
         // Scalar path
         x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero();
         x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero();
@@ -2177,6 +2195,7 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me
                 lane(w1, l) = sat.w[c1];
             }
         }
+
 #endif
 
         VFloat w2 = vbroadcast(w_sum) - w1;
@@ -2536,7 +2555,7 @@ static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 met
 Decoder s_decoder = Decoder_D3D10;
 
 // D3D10
-inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
+inline void evaluate_palette4_d3d10(Color32 palette[4]) {
     palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
     palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
     palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
@@ -2547,7 +2566,7 @@ inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4])
     palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
     palette[3].a = 0xFF;
 }
-inline void evaluate_palette3_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
+inline void evaluate_palette3_d3d10(Color32 palette[4]) {
     palette[2].r = (palette[0].r + palette[1].r) / 2;
     palette[2].g = (palette[0].g + palette[1].g) / 2;
     palette[2].b = (palette[0].b + palette[1].b) / 2;
@@ -2558,31 +2577,31 @@ static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
     palette[0] = bitexpand_color16_to_color32(c0);
     palette[1] = bitexpand_color16_to_color32(c1);
     if (c0.u > c1.u) {
-        evaluate_palette4_d3d10(c0, c1, palette);
+        evaluate_palette4_d3d10(palette);
     }
     else {
-        evaluate_palette3_d3d10(c0, c1, palette);
+        evaluate_palette3_d3d10(palette);
     }
 }
 
 // NV
 inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
     int gdiff = palette[1].g - palette[0].g;
-    palette[2].r = ((2 * c0.r + c1.r) * 22) / 8;
-    palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256;
-    palette[2].b = ((2 * c0.b + c1.b) * 22) / 8;
+    palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8);
+    palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256);
+    palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8);
     palette[2].a = 0xFF;
 
-    palette[3].r = ((2 * c1.r + c0.r) * 22) / 8;
-    palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256;
-    palette[3].b = ((2 * c1.b + c0.b) * 22) / 8;
+    palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8);
+    palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256);
+    palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8);
     palette[3].a = 0xFF;
 }
 inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
     int gdiff = palette[1].g - palette[0].g;
-    palette[2].r = ((c0.r + c1.r) * 33) / 8;
-    palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256;
-    palette[2].b = ((c0.b + c1.b) * 33) / 8;
+    palette[2].r = uint8(((c0.r + c1.r) * 33) / 8);
+    palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256);
+    palette[2].b = uint8(((c0.b + c1.b) * 33) / 8);
     palette[2].a = 0xFF;
     palette[3].u = 0;
 }
@@ -2599,21 +2618,21 @@ static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
 }
 
 // AMD
-inline void evaluate_palette4_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
-    palette[2].r = (43 * palette[0].r + 21 * palette[1].r + 32) >> 6;
-    palette[2].g = (43 * palette[0].g + 21 * palette[1].g + 32) >> 6;
-    palette[2].b = (43 * palette[0].b + 21 * palette[1].b + 32) >> 6;
+inline void evaluate_palette4_amd(Color32 palette[4]) {
+    palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6);
+    palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6);
+    palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6);
     palette[2].a = 0xFF;
 
-    palette[3].r = (43 * palette[1].r + 21 * palette[0].r + 32) >> 6;
-    palette[3].g = (43 * palette[1].g + 21 * palette[0].g + 32) >> 6;
-    palette[3].b = (43 * palette[1].b + 21 * palette[0].b + 32) >> 6;
+    palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6);
+    palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6);
+    palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6);
     palette[3].a = 0xFF;
 }
-inline void evaluate_palette3_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
-    palette[2].r = (palette[0].r + palette[1].r + 1) / 2;
-    palette[2].g = (palette[0].g + palette[1].g + 1) / 2;
-    palette[2].b = (palette[0].b + palette[1].b + 1) / 2;
+inline void evaluate_palette3_amd(Color32 palette[4]) {
+    palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2);
+    palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2);
+    palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2);
     palette[2].a = 0xFF;
     palette[3].u = 0;
 }
@@ -2622,27 +2641,17 @@ static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
     palette[1] = bitexpand_color16_to_color32(c1);
 
     if (c0.u > c1.u) {
-        evaluate_palette4_amd(c0, c1, palette);
+        evaluate_palette4_amd(palette);
     }
     else {
-        evaluate_palette3_amd(c0, c1, palette);
+        evaluate_palette3_amd(palette);
     }
 }
 
-inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) {
-    if (s_decoder == Decoder_D3D10)         evaluate_palette4_d3d10(c0, c1, palette);    
-    else if (s_decoder == Decoder_NVIDIA)   evaluate_palette4_nv(c0, c1, palette);
-    else if (s_decoder == Decoder_AMD)      evaluate_palette4_amd(c0, c1, palette);
-}
-inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) {
-    if (s_decoder == Decoder_D3D10)         evaluate_palette3_d3d10(c0, c1, palette);
-    else if (s_decoder == Decoder_NVIDIA)   evaluate_palette3_nv(c0, c1, palette);
-    else if (s_decoder == Decoder_AMD)      evaluate_palette3_amd(c0, c1, palette);
-}
-inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) {
-    if (s_decoder == Decoder_D3D10)         evaluate_palette_d3d10(c0, c1, palette);
-    else if (s_decoder == Decoder_NVIDIA)   evaluate_palette_nv(c0, c1, palette);
-    else if (s_decoder == Decoder_AMD)      evaluate_palette_amd(c0, c1, palette);
+inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) {
+    if (decoder == Decoder_D3D10)         evaluate_palette_d3d10(c0, c1, palette);
+    else if (decoder == Decoder_NVIDIA)   evaluate_palette_nv(c0, c1, palette);
+    else if (decoder == Decoder_AMD)      evaluate_palette_amd(c0, c1, palette);
 }
 
 static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
@@ -2657,15 +2666,7 @@ static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
 static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder)
 {
     Color32 palette[4];
-    if (decoder == Decoder_NVIDIA) {
-        evaluate_palette_nv(block->col0, block->col1, palette);
-    }
-    else if (decoder == Decoder_AMD) {
-        evaluate_palette_amd(block->col0, block->col1, palette);
-    }
-    else {
-        evaluate_palette(block->col0, block->col1, palette);
-    }
+    evaluate_palette(block->col0, block->col1, palette, decoder);
 
     for (int i = 0; i < 16; i++) {
         int index = (block->indices >> (2 * i)) & 3;
@@ -2693,23 +2694,10 @@ static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 &
     return dot(d, d);
 }
 
-
-/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
-    return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z);
-}*/
-
 static int evaluate_mse(const Color32 & p, const Color32 & c) {
     return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
 }
 
-/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) {
-    float e0 = evaluate_mse(palette[0], c, w);
-    float e1 = evaluate_mse(palette[1], c, w);
-    float e2 = evaluate_mse(palette[2], c, w);
-    float e3 = evaluate_mse(palette[3], c, w);
-    return min(min(e0, e1), min(e2, e3));
-}*/
-
 static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
     int e0 = evaluate_mse(palette[0], c);
     int e1 = evaluate_mse(palette[1], c);
@@ -2760,17 +2748,20 @@ static float evaluate_mse(const Vector4 input_colors[16], const float input_weig
     return error;
 }
 
+static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) {
+
+    // evaluate error for each index.
+    float error = 0.0f;
+    for (int i = 0; i < 16; i++) {
+        int index = (indices >> (2 * i)) & 3;
+        error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights);
+    }
+    return error;
+}
+
 float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) {
     Color32 palette[4];
-    if (decoder == Decoder_NVIDIA) {
-        evaluate_palette_nv(block->col0, block->col1, palette);
-    }
-    else if (decoder == Decoder_AMD) {
-        evaluate_palette_amd(block->col0, block->col1, palette);
-    }
-    else {
-        evaluate_palette(block->col0, block->col1, palette);
-    }
+    evaluate_palette(block->col0, block->col1, palette, decoder);
 
     // evaluate error for each index.
     float error = 0.0f;
@@ -2845,32 +2836,7 @@ static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & col
     return interleave(indices1, indices0);
 }
 
-static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
-#if 0
-    Vector3 p0 = palette[0] * color_weights;
-    Vector3 p1 = palette[1] * color_weights;
-    Vector3 p2 = palette[2] * color_weights;
-    Vector3 p3 = palette[3] * color_weights;
-
-    uint indices = 0;
-    for (int i = 0; i < 16; i++) {
-        Vector3 ci = input_colors[i].xyz * color_weights;
-        float d0 = lengthSquared(p0 - ci);
-        float d1 = lengthSquared(p1 - ci);
-        float d2 = lengthSquared(p2 - ci);
-        float d3 = lengthSquared(p3 - ci);
-
-        uint index;
-        if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
-        else if (d1 < d2 && d1 < d3) index = 1;
-        else if (d2 < d3) index = 2;
-        else index = 3;
-
-        indices |= index << (2 * i);
-    }
-
-    return indices;
-#else
+static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) {
     uint indices0 = 0;
     uint indices1 = 0;
 
@@ -2879,90 +2845,40 @@ static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & col
     VVector3 vp1 = vbroadcast(palette[1]) * vw;
     VVector3 vp2 = vbroadcast(palette[2]) * vw;
 
-    for (int i = 0; i < 16; i += VEC_SIZE) {
-        VVector3 vc = vload(&input_colors[i]) * vw;
+    if (allow_transparent_black) {
+        for (int i = 0; i < 16; i += VEC_SIZE) {
+            VVector3 vc = vload(&input_colors[i]) * vw;
 
-        VFloat d0 = vlen2(vc - vp0);
-        VFloat d1 = vlen2(vc - vp1);
-        VFloat d2 = vlen2(vc - vp2);
-        VFloat d3 = vdot(vc, vc);
+            VFloat d0 = vlen2(vp0 - vc);
+            VFloat d1 = vlen2(vp1 - vc);
+            VFloat d2 = vlen2(vp2 - vc);
+            VFloat d3 = vdot(vc, vc);
 
-        // @@ simplify i1 & i2
-        //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3); // 0
-        VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3); // 1
-        VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3); // 2
-        VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); // 3
-        //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0));
+            VMask i1 = (d1 < d2);
+            VMask i2 = (d2 <= d0) & (d2 <= d1);
+            VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);
 
-        indices0 |= mask(i2 | i3) << i;
-        indices1 |= mask(i1 | i3) << i;
+            indices0 |= mask(i2 | i3) << i;
+            indices1 |= mask(i1 | i3) << i;
+        }
     }
+    else {
+        for (int i = 0; i < 16; i += VEC_SIZE) {
+            VVector3 vc = vload(&input_colors[i]) * vw;
 
-    uint indices = interleave(indices1, indices0);
-    return indices;
-#endif
-}
-
-
-
-static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) {
-#if 0
-    uint indices0 = 0;
-    uint indices1 = 0;
-
-    VVector3 vp0 = vbroadcast(palette[0]);
-    VVector3 vp1 = vbroadcast(palette[1]);
-    VVector3 vp2 = vbroadcast(palette[2]);
-    VVector3 vp3 = vbroadcast(palette[3]);
-
-    for (int i = 0; i < 16; i += VEC_SIZE) {
-        VVector3 vc = vload(&input_colors[i]);
-
-        VFloat d0 = vlen2(vc - vp0);
-        VFloat d1 = vlen2(vc - vp1);
-        VFloat d2 = vlen2(vc - vp2);
-        VFloat d3 = vlen2(vc - vp3);
-
-        VMask b1 = d1 > d2;
-        VMask b2 = d0 > d2;
-        VMask x0 = b1 & b2;
+            VFloat d0 = vlen2(vc - vp0);
+            VFloat d1 = vlen2(vc - vp1);
+            VFloat d2 = vlen2(vc - vp2);
 
-        VMask b0 = d0 > d3;
-        VMask b3 = d1 > d3;
-        x0 = x0 | (b0 & b3);
+            VMask i1 = (d1 < d2);
+            VMask i2 = (d2 <= d0) & (d2 <= d1);
 
-        VMask b4 = d2 > d3;
-        VMask x1 = b0 & b4;
-
-        indices0 |= mask(x0) << i;
-        indices1 |= mask(x1) << i;
+            indices0 |= mask(i2) << i;
+            indices1 |= mask(i1) << i;
+        }
     }
 
     return interleave(indices1, indices0);
-#else
-    uint indices = 0;
-    for (int i = 0; i < 16; i++) {
-        Vector3 ci = input_colors[i];
-        float d0 = lengthSquared(palette[0] - ci);
-        float d1 = lengthSquared(palette[1] - ci);
-        float d2 = lengthSquared(palette[2] - ci);
-        float d3 = lengthSquared(palette[3] - ci);
-
-        uint b0 = d0 > d3;
-        uint b1 = d1 > d2;
-        uint b2 = d0 > d2;
-        uint b3 = d1 > d3;
-        uint b4 = d2 > d3;
-
-        uint x0 = b1 & b2;
-        uint x1 = b0 & b3;
-        uint x2 = b0 & b4;
-
-        indices |= (x2 | ((x0 | x1) << 1)) << (2 * i);
-    }
-
-    return indices;
-#endif
 }
 
 
@@ -3025,7 +2941,7 @@ static uint compute_indices(const Vector4 input_colors[16], const Vector3 & colo
 }
 
 
-static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
 {
     Color16 color0 = vector3_to_color16(v0);
     Color16 color1 = vector3_to_color16(v1);
@@ -3039,10 +2955,12 @@ static void output_block3(const Vector4 input_colors[16], const Vector3 & color_
 
     block->col0 = color0;
     block->col1 = color1;
-    block->indices = compute_indices(input_colors, color_weights, palette);
+    block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette);
+
+    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
 }
 
-static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
+static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
 {
     Color16 color0 = vector3_to_color16(v0);
     Color16 color1 = vector3_to_color16(v1);
@@ -3057,26 +2975,11 @@ static void output_block4(const Vector4 input_colors[16], const Vector3 & color_
     block->col0 = color0;
     block->col1 = color1;
     block->indices = compute_indices4(input_colors, color_weights, palette);
-}
-
-
-static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
-{
-    Color16 color0 = vector3_to_color16(v0);
-    Color16 color1 = vector3_to_color16(v1);
-
-    if (color0.u < color1.u) {
-        swap(color0, color1);
-    }
 
-    Vector3 palette[4];
-    evaluate_palette(color0, color1, palette);
-
-    block->col0 = color0;
-    block->col1 = color1;
-    block->indices = compute_indices4(input_colors, palette);
+    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
 }
 
+
 // Least squares fitting of color end points for the given indices. @@ Take weights into account.
 static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
 {
@@ -3263,7 +3166,7 @@ static inline int Lerp13(int a, int b)
 static void PrepareOptTable5(uint8 * table, Decoder decoder)
 {
     uint8 expand[32];
-    for (int i = 0; i < 32; i++) expand[i] = (i << 3) | (i >> 2);
+    for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2));
 
     for (int i = 0; i < 256; i++) {
         int bestErr = 256 * 100;
@@ -3292,17 +3195,17 @@ static void PrepareOptTable5(uint8 * table, Decoder decoder)
                     // Another approach is to consider the worst of AMD and NVIDIA errors.
                     err = max(amd_err, nv_err);                    
                 }
-                else if (decoder == Decoder_AMD) {
-                    err = amd_err;
-                }
                 else if (decoder == Decoder_NVIDIA) {
                     err = nv_err;
                 }
+                else /*if (decoder == Decoder_AMD)*/ {
+                    err = amd_err;
+                }
 
                 if (err < bestErr) {
                     bestErr = err;
-                    table[i * 2 + 0] = mx;
-                    table[i * 2 + 1] = mn;
+                    table[i * 2 + 0] = uint8(mx);
+                    table[i * 2 + 1] = uint8(mn);
                 }
             }
         }
@@ -3312,7 +3215,7 @@ static void PrepareOptTable5(uint8 * table, Decoder decoder)
 static void PrepareOptTable6(uint8 * table, Decoder decoder)
 {
     uint8 expand[64];
-    for (int i = 0; i < 64; i++) expand[i] = (i << 2) | (i >> 4);
+    for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4));
 
     for (int i = 0; i < 256; i++) {
         int bestErr = 256 * 100;
@@ -3341,17 +3244,17 @@ static void PrepareOptTable6(uint8 * table, Decoder decoder)
                     // Another approach is to consider the worst of AMD and NVIDIA errors.
                     err = max(amd_err, nv_err);
                 }
-                else if (decoder == Decoder_AMD) {
-                    err = amd_err;
-                }
                 else if (decoder == Decoder_NVIDIA) {
                     err = nv_err;
                 }
+                else /*if (decoder == Decoder_AMD)*/ {
+                    err = amd_err;
+                }
 
                 if (err < bestErr) {
                     bestErr = err;
-                    table[i * 2 + 0] = mx;
-                    table[i * 2 + 1] = mn;
+                    table[i * 2 + 0] = uint8(mx);
+                    table[i * 2 + 1] = uint8(mn);
                 }
             }
         }
@@ -3386,7 +3289,7 @@ static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
 }
 
 
-static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool use_transparent_black, BlockDXT1 * output)
+static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockDXT1 * output)
 {
     Vector3 metric_sqr = color_weights * color_weights;
 
@@ -3396,12 +3299,10 @@ static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const flo
     Vector3 start, end;
     cluster_fit_four(sat, sat_count, metric_sqr, &start, &end);
 
-    output_block4(input_colors, color_weights, start, end, output);
-
-    float best_error = evaluate_mse(input_colors, input_weights, color_weights, output);
+    float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output);
 
     if (three_color_mode) {
-        if (use_transparent_black) {
+        if (try_transparent_black) {
             Vector3 tmp_colors[16];
             float tmp_weights[16];
             int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights);
@@ -3413,9 +3314,7 @@ static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const flo
         cluster_fit_three(sat, sat_count, metric_sqr, &start, &end);
 
         BlockDXT1 three_color_block;
-        output_block3(input_colors, color_weights, start, end, &three_color_block);
-
-        float three_color_error = evaluate_mse(input_colors, input_weights, color_weights, &three_color_block);
+        float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block);
 
         if (three_color_error < best_error) {
             best_error = three_color_error;
@@ -3623,16 +3522,13 @@ static float compress_dxt1(Quality level, const Vector4 input_colors[16], const
         fit_colors_bbox(colors, count, &c0, &c1);
         inset_bbox(&c0, &c1);
         select_diagonal(colors, count, &c0, &c1);
-        output_block4(input_colors, color_weights, c0, c1, output);
-
-        error = evaluate_mse(input_colors, input_weights, color_weights, output);
+        error = output_block4(input_colors, input_weights, color_weights, c0, c1, output);
 
         // Refine color for the selected indices.
         if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
             BlockDXT1 optimized_block;
-            output_block4(input_colors, color_weights, c0, c1, &optimized_block);
+            float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block);
 
-            float optimized_error = evaluate_mse(input_colors, input_weights, color_weights, &optimized_block);
             if (optimized_error < error) {
                 error = optimized_error;
                 *output = optimized_block;
@@ -3648,7 +3544,7 @@ static float compress_dxt1(Quality level, const Vector4 input_colors[16], const
 
         // Try cluster fit.
         BlockDXT1 cluster_fit_output;
-        float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, &cluster_fit_output);
+        float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output);
         if (cluster_fit_error < error) {
             *output = cluster_fit_output;
             error = cluster_fit_error;
@@ -3713,6 +3609,7 @@ float compress_dxt1(Quality level, const float * input_colors, const float * inp
 // v1.02 - Removed SIMD code path.
 // v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection.
 // v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations.
+// v1.05 - Bug fixes. Small optimizations.
 
 // Copyright (c) 2020 Ignacio Castano <castano@gmail.com>
 //