From 831c8e6667812f749d90a2c5e472b48184b7cebc Mon Sep 17 00:00:00 2001 From: Ignacio Castano Date: Sun, 23 Aug 2020 21:58:18 -0700 Subject: [PATCH] Upgrade ICBC to 1.05 --- src/nvtt/icbc.h | 349 +++++++++++++++++------------------------------- 1 file changed, 123 insertions(+), 226 deletions(-) diff --git a/src/nvtt/icbc.h b/src/nvtt/icbc.h index 82fd9fd..681a77a 100644 --- a/src/nvtt/icbc.h +++ b/src/nvtt/icbc.h @@ -1,4 +1,4 @@ -// icbc.h v1.04 +// icbc.h v1.05 // A High Quality SIMD BC1 Encoder by Ignacio Castano . // // LICENSE: @@ -376,6 +376,7 @@ ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); } ICBC_FORCEINLINE VFloat lane_id() { return 0; } ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; } +ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; } ICBC_FORCEINLINE bool all(VMask m) { return m; } ICBC_FORCEINLINE bool any(VMask m) { return m; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; } @@ -511,6 +512,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { #endif } +ICBC_FORCEINLINE VMask vbroadcast(bool b) { + return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); +} + ICBC_FORCEINLINE bool all(VMask m) { int value = _mm_movemask_ps(m); return value == 0x7; @@ -702,6 +707,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return _mm256_blendv_ps(a, b, mask); } +ICBC_FORCEINLINE VMask vbroadcast(bool b) { + return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b))); +} + ICBC_FORCEINLINE bool all(VMask m) { __m256 zero = _mm256_setzero_ps(); return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0; @@ -715,6 +724,11 @@ ICBC_FORCEINLINE uint mask(VMask m) { return (uint)_mm256_movemask_ps(m); } +// This is missing on some GCC versions. +#if !defined _mm256_set_m128 +#define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1) +#endif + ICBC_FORCEINLINE int reduce_min_index(VFloat v) { __m128 vlow = _mm256_castps256_ps128(v); @@ -917,6 +931,10 @@ ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return _mm512_mask_blend_ps(mask.m, a, b); } +ICBC_FORCEINLINE VMask vbroadcast(bool b) { + return { __mmask16(-int16_t(b)) }; +} + ICBC_FORCEINLINE bool all(VMask mask) { return mask.m == 0xFFFFFFFF; } @@ -1572,7 +1590,7 @@ static Color16 vector3_to_color16(const Vector3 & v) { b += (v.z > midpoints5[b]); Color16 c; - c.u = (r << 11) | (g << 5) | b; + c.u = uint16((r << 11) | (g << 5) | b); return c; } @@ -1822,7 +1840,7 @@ int compute_sat(const Vector3 * colors, const float * weights, int count, Summed sat->w[0] = w; for (int i = 1; i < count; i++) { - float w = weights[order[i]]; + w = weights[order[i]]; sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w; sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w; sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w; @@ -1906,9 +1924,9 @@ static void init_cluster_tables() { } if (!found) { - s_fourCluster[i].c0 = c0; - s_fourCluster[i].c1 = c0+c1; - s_fourCluster[i].c2 = c0+c1+c2; + s_fourCluster[i].c0 = uint8(c0); + s_fourCluster[i].c1 = uint8(c0+c1); + s_fourCluster[i].c2 = uint8(c0+c1+c2); i++; } } @@ -1941,8 +1959,8 @@ static void init_cluster_tables() { } if (!found) { - s_threeCluster[i].c0 = c0; - s_threeCluster[i].c1 = c0 + c1; + s_threeCluster[i].c0 = uint8(c0); + s_threeCluster[i].c1 = uint8(c0 + c1); i++; } } @@ -2040,7 +2058,6 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me x1.z = vpermuteif(c1 >= 0, vbsat, c1); w1 = vpermuteif(c1 >= 0, vwsat, c1); - #elif ICBC_USE_AVX2_PERMUTE2 // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2. @@ -2156,6 +2173,7 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me } #else + // Scalar path x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); @@ -2177,6 +2195,7 @@ static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 me lane(w1, l) = sat.w[c1]; } } + #endif VFloat w2 = vbroadcast(w_sum) - w1; @@ -2536,7 +2555,7 @@ static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 met Decoder s_decoder = Decoder_D3D10; // D3D10 -inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { +inline void evaluate_palette4_d3d10(Color32 palette[4]) { palette[2].r = (2 * palette[0].r + palette[1].r) / 3; palette[2].g = (2 * palette[0].g + palette[1].g) / 3; palette[2].b = (2 * palette[0].b + palette[1].b) / 3; @@ -2547,7 +2566,7 @@ inline void evaluate_palette4_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) palette[3].b = (2 * palette[1].b + palette[0].b) / 3; palette[3].a = 0xFF; } -inline void evaluate_palette3_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { +inline void evaluate_palette3_d3d10(Color32 palette[4]) { palette[2].r = (palette[0].r + palette[1].r) / 2; palette[2].g = (palette[0].g + palette[1].g) / 2; palette[2].b = (palette[0].b + palette[1].b) / 2; @@ -2558,31 +2577,31 @@ static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { - evaluate_palette4_d3d10(c0, c1, palette); + evaluate_palette4_d3d10(palette); } else { - evaluate_palette3_d3d10(c0, c1, palette); + evaluate_palette3_d3d10(palette); } } // NV inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; - palette[2].r = ((2 * c0.r + c1.r) * 22) / 8; - palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256; - palette[2].b = ((2 * c0.b + c1.b) * 22) / 8; + palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8); + palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256); + palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8); palette[2].a = 0xFF; - palette[3].r = ((2 * c1.r + c0.r) * 22) / 8; - palette[3].g = (256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256; - palette[3].b = ((2 * c1.b + c0.b) * 22) / 8; + palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8); + palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256); + palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8); palette[3].a = 0xFF; } inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; - palette[2].r = ((c0.r + c1.r) * 33) / 8; - palette[2].g = (256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256; - palette[2].b = ((c0.b + c1.b) * 33) / 8; + palette[2].r = uint8(((c0.r + c1.r) * 33) / 8); + palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256); + palette[2].b = uint8(((c0.b + c1.b) * 33) / 8); palette[2].a = 0xFF; palette[3].u = 0; } @@ -2599,21 +2618,21 @@ static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { } // AMD -inline void evaluate_palette4_amd(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (43 * palette[0].r + 21 * palette[1].r + 32) >> 6; - palette[2].g = (43 * palette[0].g + 21 * palette[1].g + 32) >> 6; - palette[2].b = (43 * palette[0].b + 21 * palette[1].b + 32) >> 6; +inline void evaluate_palette4_amd(Color32 palette[4]) { + palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6); + palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6); + palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6); palette[2].a = 0xFF; - palette[3].r = (43 * palette[1].r + 21 * palette[0].r + 32) >> 6; - palette[3].g = (43 * palette[1].g + 21 * palette[0].g + 32) >> 6; - palette[3].b = (43 * palette[1].b + 21 * palette[0].b + 32) >> 6; + palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6); + palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6); + palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6); palette[3].a = 0xFF; } -inline void evaluate_palette3_amd(Color16 c0, Color16 c1, Color32 palette[4]) { - palette[2].r = (palette[0].r + palette[1].r + 1) / 2; - palette[2].g = (palette[0].g + palette[1].g + 1) / 2; - palette[2].b = (palette[0].b + palette[1].b + 1) / 2; +inline void evaluate_palette3_amd(Color32 palette[4]) { + palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2); + palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2); + palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2); palette[2].a = 0xFF; palette[3].u = 0; } @@ -2622,27 +2641,17 @@ static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { - evaluate_palette4_amd(c0, c1, palette); + evaluate_palette4_amd(palette); } else { - evaluate_palette3_amd(c0, c1, palette); + evaluate_palette3_amd(palette); } } -inline void evaluate_palette4(Color16 c0, Color16 c1, Color32 palette[4]) { - if (s_decoder == Decoder_D3D10) evaluate_palette4_d3d10(c0, c1, palette); - else if (s_decoder == Decoder_NVIDIA) evaluate_palette4_nv(c0, c1, palette); - else if (s_decoder == Decoder_AMD) evaluate_palette4_amd(c0, c1, palette); -} -inline void evaluate_palette3(Color16 c0, Color16 c1, Color32 palette[4]) { - if (s_decoder == Decoder_D3D10) evaluate_palette3_d3d10(c0, c1, palette); - else if (s_decoder == Decoder_NVIDIA) evaluate_palette3_nv(c0, c1, palette); - else if (s_decoder == Decoder_AMD) evaluate_palette3_amd(c0, c1, palette); -} -inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { - if (s_decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette); - else if (s_decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette); - else if (s_decoder == Decoder_AMD) evaluate_palette_amd(c0, c1, palette); +inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) { + if (decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette); + else if (decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette); + else if (decoder == Decoder_AMD) evaluate_palette_amd(c0, c1, palette); } static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { @@ -2657,15 +2666,7 @@ static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) { Color32 palette[4]; - if (decoder == Decoder_NVIDIA) { - evaluate_palette_nv(block->col0, block->col1, palette); - } - else if (decoder == Decoder_AMD) { - evaluate_palette_amd(block->col0, block->col1, palette); - } - else { - evaluate_palette(block->col0, block->col1, palette); - } + evaluate_palette(block->col0, block->col1, palette, decoder); for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; @@ -2693,23 +2694,10 @@ static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & return dot(d, d); } - -/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { - return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); -}*/ - static int evaluate_mse(const Color32 & p, const Color32 & c) { return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); } -/*static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) { - float e0 = evaluate_mse(palette[0], c, w); - float e1 = evaluate_mse(palette[1], c, w); - float e2 = evaluate_mse(palette[2], c, w); - float e3 = evaluate_mse(palette[3], c, w); - return min(min(e0, e1), min(e2, e3)); -}*/ - static int evaluate_mse(const Color32 palette[4], const Color32 & c) { int e0 = evaluate_mse(palette[0], c); int e1 = evaluate_mse(palette[1], c); @@ -2760,17 +2748,20 @@ static float evaluate_mse(const Vector4 input_colors[16], const float input_weig return error; } +static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) { + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (indices >> (2 * i)) & 3; + error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); + } + return error; +} + float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) { Color32 palette[4]; - if (decoder == Decoder_NVIDIA) { - evaluate_palette_nv(block->col0, block->col1, palette); - } - else if (decoder == Decoder_AMD) { - evaluate_palette_amd(block->col0, block->col1, palette); - } - else { - evaluate_palette(block->col0, block->col1, palette); - } + evaluate_palette(block->col0, block->col1, palette, decoder); // evaluate error for each index. float error = 0.0f; @@ -2845,32 +2836,7 @@ static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & col return interleave(indices1, indices0); } -static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { -#if 0 - Vector3 p0 = palette[0] * color_weights; - Vector3 p1 = palette[1] * color_weights; - Vector3 p2 = palette[2] * color_weights; - Vector3 p3 = palette[3] * color_weights; - - uint indices = 0; - for (int i = 0; i < 16; i++) { - Vector3 ci = input_colors[i].xyz * color_weights; - float d0 = lengthSquared(p0 - ci); - float d1 = lengthSquared(p1 - ci); - float d2 = lengthSquared(p2 - ci); - float d3 = lengthSquared(p3 - ci); - - uint index; - if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; - else if (d1 < d2 && d1 < d3) index = 1; - else if (d2 < d3) index = 2; - else index = 3; - - indices |= index << (2 * i); - } - - return indices; -#else +static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) { uint indices0 = 0; uint indices1 = 0; @@ -2879,90 +2845,40 @@ static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & col VVector3 vp1 = vbroadcast(palette[1]) * vw; VVector3 vp2 = vbroadcast(palette[2]) * vw; - for (int i = 0; i < 16; i += VEC_SIZE) { - VVector3 vc = vload(&input_colors[i]) * vw; + if (allow_transparent_black) { + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]) * vw; - VFloat d0 = vlen2(vc - vp0); - VFloat d1 = vlen2(vc - vp1); - VFloat d2 = vlen2(vc - vp2); - VFloat d3 = vdot(vc, vc); + VFloat d0 = vlen2(vp0 - vc); + VFloat d1 = vlen2(vp1 - vc); + VFloat d2 = vlen2(vp2 - vc); + VFloat d3 = vdot(vc, vc); - // @@ simplify i1 & i2 - //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3); // 0 - VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3); // 1 - VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3); // 2 - VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); // 3 - //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0)); + VMask i1 = (d1 < d2); + VMask i2 = (d2 <= d0) & (d2 <= d1); + VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); - indices0 |= mask(i2 | i3) << i; - indices1 |= mask(i1 | i3) << i; + indices0 |= mask(i2 | i3) << i; + indices1 |= mask(i1 | i3) << i; + } } + else { + for (int i = 0; i < 16; i += VEC_SIZE) { + VVector3 vc = vload(&input_colors[i]) * vw; - uint indices = interleave(indices1, indices0); - return indices; -#endif -} - - - -static uint compute_indices4(const Vector3 input_colors[16], const Vector3 palette[4]) { -#if 0 - uint indices0 = 0; - uint indices1 = 0; - - VVector3 vp0 = vbroadcast(palette[0]); - VVector3 vp1 = vbroadcast(palette[1]); - VVector3 vp2 = vbroadcast(palette[2]); - VVector3 vp3 = vbroadcast(palette[3]); - - for (int i = 0; i < 16; i += VEC_SIZE) { - VVector3 vc = vload(&input_colors[i]); - - VFloat d0 = vlen2(vc - vp0); - VFloat d1 = vlen2(vc - vp1); - VFloat d2 = vlen2(vc - vp2); - VFloat d3 = vlen2(vc - vp3); - - VMask b1 = d1 > d2; - VMask b2 = d0 > d2; - VMask x0 = b1 & b2; + VFloat d0 = vlen2(vc - vp0); + VFloat d1 = vlen2(vc - vp1); + VFloat d2 = vlen2(vc - vp2); - VMask b0 = d0 > d3; - VMask b3 = d1 > d3; - x0 = x0 | (b0 & b3); + VMask i1 = (d1 < d2); + VMask i2 = (d2 <= d0) & (d2 <= d1); - VMask b4 = d2 > d3; - VMask x1 = b0 & b4; - - indices0 |= mask(x0) << i; - indices1 |= mask(x1) << i; + indices0 |= mask(i2) << i; + indices1 |= mask(i1) << i; + } } return interleave(indices1, indices0); -#else - uint indices = 0; - for (int i = 0; i < 16; i++) { - Vector3 ci = input_colors[i]; - float d0 = lengthSquared(palette[0] - ci); - float d1 = lengthSquared(palette[1] - ci); - float d2 = lengthSquared(palette[2] - ci); - float d3 = lengthSquared(palette[3] - ci); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); - } - - return indices; -#endif } @@ -3025,7 +2941,7 @@ static uint compute_indices(const Vector4 input_colors[16], const Vector3 & colo } -static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) { Color16 color0 = vector3_to_color16(v0); Color16 color1 = vector3_to_color16(v1); @@ -3039,10 +2955,12 @@ static void output_block3(const Vector4 input_colors[16], const Vector3 & color_ block->col0 = color0; block->col1 = color1; - block->indices = compute_indices(input_colors, color_weights, palette); + block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette); + + return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices); } -static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) { Color16 color0 = vector3_to_color16(v0); Color16 color1 = vector3_to_color16(v1); @@ -3057,26 +2975,11 @@ static void output_block4(const Vector4 input_colors[16], const Vector3 & color_ block->col0 = color0; block->col1 = color1; block->indices = compute_indices4(input_colors, color_weights, palette); -} - - -static void output_block4(const Vector3 input_colors[16], const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) -{ - Color16 color0 = vector3_to_color16(v0); - Color16 color1 = vector3_to_color16(v1); - - if (color0.u < color1.u) { - swap(color0, color1); - } - Vector3 palette[4]; - evaluate_palette(color0, color1, palette); - - block->col0 = color0; - block->col1 = color1; - block->indices = compute_indices4(input_colors, palette); + return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices); } + // Least squares fitting of color end points for the given indices. @@ Take weights into account. static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) { @@ -3263,7 +3166,7 @@ static inline int Lerp13(int a, int b) static void PrepareOptTable5(uint8 * table, Decoder decoder) { uint8 expand[32]; - for (int i = 0; i < 32; i++) expand[i] = (i << 3) | (i >> 2); + for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; @@ -3292,17 +3195,17 @@ static void PrepareOptTable5(uint8 * table, Decoder decoder) // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } - else if (decoder == Decoder_AMD) { - err = amd_err; - } else if (decoder == Decoder_NVIDIA) { err = nv_err; } + else /*if (decoder == Decoder_AMD)*/ { + err = amd_err; + } if (err < bestErr) { bestErr = err; - table[i * 2 + 0] = mx; - table[i * 2 + 1] = mn; + table[i * 2 + 0] = uint8(mx); + table[i * 2 + 1] = uint8(mn); } } } @@ -3312,7 +3215,7 @@ static void PrepareOptTable5(uint8 * table, Decoder decoder) static void PrepareOptTable6(uint8 * table, Decoder decoder) { uint8 expand[64]; - for (int i = 0; i < 64; i++) expand[i] = (i << 2) | (i >> 4); + for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; @@ -3341,17 +3244,17 @@ static void PrepareOptTable6(uint8 * table, Decoder decoder) // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } - else if (decoder == Decoder_AMD) { - err = amd_err; - } else if (decoder == Decoder_NVIDIA) { err = nv_err; } + else /*if (decoder == Decoder_AMD)*/ { + err = amd_err; + } if (err < bestErr) { bestErr = err; - table[i * 2 + 0] = mx; - table[i * 2 + 1] = mn; + table[i * 2 + 0] = uint8(mx); + table[i * 2 + 1] = uint8(mn); } } } @@ -3386,7 +3289,7 @@ static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) } -static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool use_transparent_black, BlockDXT1 * output) +static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockDXT1 * output) { Vector3 metric_sqr = color_weights * color_weights; @@ -3396,12 +3299,10 @@ static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const flo Vector3 start, end; cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); - output_block4(input_colors, color_weights, start, end, output); - - float best_error = evaluate_mse(input_colors, input_weights, color_weights, output); + float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output); if (three_color_mode) { - if (use_transparent_black) { + if (try_transparent_black) { Vector3 tmp_colors[16]; float tmp_weights[16]; int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights); @@ -3413,9 +3314,7 @@ static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const flo cluster_fit_three(sat, sat_count, metric_sqr, &start, &end); BlockDXT1 three_color_block; - output_block3(input_colors, color_weights, start, end, &three_color_block); - - float three_color_error = evaluate_mse(input_colors, input_weights, color_weights, &three_color_block); + float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block); if (three_color_error < best_error) { best_error = three_color_error; @@ -3623,16 +3522,13 @@ static float compress_dxt1(Quality level, const Vector4 input_colors[16], const fit_colors_bbox(colors, count, &c0, &c1); inset_bbox(&c0, &c1); select_diagonal(colors, count, &c0, &c1); - output_block4(input_colors, color_weights, c0, c1, output); - - error = evaluate_mse(input_colors, input_weights, color_weights, output); + error = output_block4(input_colors, input_weights, color_weights, c0, c1, output); // Refine color for the selected indices. if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) { BlockDXT1 optimized_block; - output_block4(input_colors, color_weights, c0, c1, &optimized_block); + float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block); - float optimized_error = evaluate_mse(input_colors, input_weights, color_weights, &optimized_block); if (optimized_error < error) { error = optimized_error; *output = optimized_block; @@ -3648,7 +3544,7 @@ static float compress_dxt1(Quality level, const Vector4 input_colors[16], const // Try cluster fit. BlockDXT1 cluster_fit_output; - float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, &cluster_fit_output); + float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output); if (cluster_fit_error < error) { *output = cluster_fit_output; error = cluster_fit_error; @@ -3713,6 +3609,7 @@ float compress_dxt1(Quality level, const float * input_colors, const float * inp // v1.02 - Removed SIMD code path. // v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection. // v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations. +// v1.05 - Bug fixes. Small optimizations. // Copyright (c) 2020 Ignacio Castano //