// icbc.h v1.05 // A High Quality SIMD BC1 Encoder by Ignacio Castano . // // LICENSE: // MIT license at the end of this file. #ifndef ICBC_H #define ICBC_H namespace icbc { enum Decoder { Decoder_D3D10 = 0, Decoder_NVIDIA = 1, Decoder_AMD = 2 }; void init_dxt1(Decoder decoder = Decoder_D3D10); enum Quality { Quality_Level1, // Box fit + least squares fit. Quality_Level2, // Cluster fit 4, threshold = 24. Quality_Level3, // Cluster fit 4, threshold = 32. Quality_Level4, // Cluster fit 4, threshold = 48. Quality_Level5, // Cluster fit 4, threshold = 64. Quality_Level6, // Cluster fit 4, threshold = 96. Quality_Level7, // Cluster fit 4, threshold = 128. Quality_Level8, // Cluster fit 4+3, threshold = 256. Quality_Level9, // Cluster fit 4+3, threshold = 256 + Refinement. Quality_Fast = Quality_Level1, Quality_Default = Quality_Level8, Quality_Max = Quality_Level9, }; void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10); float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10); float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output); } #endif // ICBC_H #ifdef ICBC_IMPLEMENTATION // Instruction level support must be chosen at compile time setting ICBC_SIMD to one of these values: #define ICBC_SCALAR 0 #define ICBC_SSE2 1 #define ICBC_SSE41 2 #define ICBC_AVX1 3 #define ICBC_AVX2 4 #define ICBC_AVX512 5 #define ICBC_NEON -1 #define ICBC_VMX -2 #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) #define ICBC_X86 1 #endif #if (defined(__arm__) || defined(_M_ARM)) #define ICBC_ARM 1 #endif #if (defined(__PPC__) || defined(_M_PPC)) #define ICBC_PPC 1 #endif // SIMD version. #ifndef ICBC_SIMD #if ICBC_X86 #if __AVX512F__ #define ICBC_SIMD ICBC_AVX512 #elif __AVX2__ #define ICBC_SIMD ICBC_AVX2 #elif __AVX__ #define ICBC_SIMD ICBC_AVX1 #elif __SSE4_1__ #define ICBC_SIMD ICBC_SSE41 #elif __SSE2__ #define ICBC_SIMD ICBC_SSE2 #else #define ICBC_SIMD ICBC_SCALAR #endif #endif #if ICBC_ARM #if __ARM_NEON__ #define ICBC_SIMD ICBC_NEON #else #define ICBC_SIMD ICBC_SCALAR #endif #endif #if ICBC_PPC #define ICBC_SIMD ICBC_VMX #endif #endif // AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess. #ifndef ICBC_USE_FMA //#define ICBC_USE_FMA 3 //#define ICBC_USE_FMA 4 #endif #if ICBC_SIMD >= ICBC_AVX2 #define ICBC_BMI2 1 #endif // Apparently rcp is not deterministic (different precision on Intel and AMD), enable if you don't care about that for a small performance boost. //#define ICBC_USE_RCP 1 #if ICBC_SIMD == ICBC_AVX2 #define ICBC_USE_AVX2_PERMUTE2 1 // Using permutevar8x32 and bitops. #endif #if ICBC_SIMD == ICBC_AVX512 #define ICBC_USE_AVX512_PERMUTE 1 #endif #if ICBC_SIMD == ICBC_NEON #define ICBC_USE_NEON_VTL 0 // Not tested. #endif // Some experimental knobs: #define ICBC_PERFECT_ROUND 0 // Enable perfect rounding to compute cluster fit residual. 
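// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the library). It assumes the
// input is a 4x4 block of RGBA texels given as 16*4 floats in [0, 1] and that
// the output is one 8-byte BC1 block (see BlockDXT1 below); the helper
// function name and the all-ones weights are made up for this example.
//
//     #define ICBC_IMPLEMENTATION
//     #include "icbc.h"
//
//     void encode_block(const float input_rgba[16 * 4], void * bc1_block) {
//         float weights[16];
//         for (int i = 0; i < 16; i++) weights[i] = 1.0f;     // Equal texel importance.
//         const float color_weights[3] = { 1, 1, 1 };         // Uniform channel weighting.
//         icbc::compress_dxt1(icbc::Quality_Default, input_rgba, weights, color_weights,
//                             /*three_color_mode=*/true, /*three_color_black=*/true, bc1_block);
//     }
//
// Call icbc::init_dxt1() once before compressing or evaluating blocks; it
// selects the decoder used for error metrics and builds the internal tables.
// ---------------------------------------------------------------------------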
#if ICBC_SIMD >= ICBC_SSE2
#include <emmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_SSE41
#include <smmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX1
#include <immintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER
#include <zmmintrin.h>
#endif

#if ICBC_SIMD == ICBC_NEON
#include <arm_neon.h>
#endif

#if ICBC_SIMD == ICBC_VMX
#include <altivec.h>
#endif

#if _MSC_VER
#include <intrin.h> // _BitScanReverse
#endif

#include <stdint.h>
#include <stdlib.h> // abs
#include <string.h> // memset
#include <math.h>   // fabsf
#include <float.h>  // FLT_MAX

#ifndef ICBC_ASSERT
#if _DEBUG
#define ICBC_ASSERT assert
#include <assert.h>
#else
#define ICBC_ASSERT(x)
#endif
#endif

namespace icbc {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Templates

template <typename T> inline void swap(T & a, T & b) { T temp(a); a = b; b = temp; }

template <typename T> inline T max(const T & a, const T & b) { return (b < a) ? a : b; }

template <typename T> inline T min(const T & a, const T & b) { return (a < b) ? a : b; }

template <typename T> inline T clamp(const T & x, const T & a, const T & b) { return min(max(x, a), b); }

template <typename T> inline T square(const T & a) { return a * a; }

///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Types

typedef uint8_t uint8;
typedef int8_t int8;
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint32_t uint;

struct Color16 {
    union {
        struct {
            uint16 b : 5;
            uint16 g : 6;
            uint16 r : 5;
        };
        uint16 u;
    };
};

struct Color32 {
    union {
        struct {
            uint8 b, g, r, a;
        };
        uint32 u;
    };
};

struct BlockDXT1 {
    Color16 col0;
    Color16 col1;
    uint32 indices;
};

struct Vector3 {
    float x;
    float y;
    float z;

    inline void operator+=(Vector3 v) { x += v.x; y += v.y; z += v.z; }
    inline void operator*=(Vector3 v) { x *= v.x; y *= v.y; z *= v.z; }
    inline void operator*=(float s) { x *= s; y *= s; z *= s; }
};

struct Vector4 {
    union {
        struct {
            float x, y, z, w;
        };
        Vector3 xyz;
    };
};

inline Vector3 operator*(Vector3 v, float s) { return { v.x * s, v.y * s, v.z * s }; }
inline Vector3 operator*(float s, Vector3 v) { return { v.x * s, v.y * s, v.z * s }; }
inline Vector3 operator*(Vector3 a, Vector3 b) { return { a.x * b.x, a.y * b.y, a.z * b.z }; }
inline float dot(Vector3 a, Vector3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
inline Vector3 operator+(Vector3 a, Vector3 b) { return { a.x + b.x, a.y + b.y, a.z + b.z }; }
inline Vector3 operator-(Vector3 a, Vector3 b) { return { a.x - b.x, a.y - b.y, a.z - b.z }; }
inline Vector3 operator/(Vector3 v, float s) { return { v.x / s, v.y / s, v.z / s }; }
inline float saturate(float x) { return clamp(x, 0.0f, 1.0f); }
inline Vector3 saturate(Vector3 v) { return { saturate(v.x), saturate(v.y), saturate(v.z) }; }
inline Vector3 min(Vector3 a, Vector3 b) { return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) }; }
inline Vector3 max(Vector3 a, Vector3 b) { return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) }; }
inline bool operator==(const Vector3 & a, const Vector3 & b) { return a.x == b.x && a.y == b.y && a.z == b.z; }
inline Vector3 scalar_to_vector3(float f) { return { f, f, f }; }
inline float lengthSquared(Vector3 v) { return dot(v, v); }

inline bool equal(float a, float b, float epsilon = 0.0001f) {
    // http://realtimecollisiondetection.net/blog/?p=89
    //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
    return fabsf(a - b) < epsilon;
}

inline bool equal(Vector3 a, Vector3 b, float epsilon) {
    return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD

#ifndef ICBC_ALIGN_16
#if __GNUC__
# define
ICBC_ALIGN_16 __attribute__ ((__aligned__ (16))) #else // _MSC_VER # define ICBC_ALIGN_16 __declspec(align(16)) #endif #endif #if __GNUC__ #define ICBC_FORCEINLINE inline __attribute__((always_inline)) #else #define ICBC_FORCEINLINE __forceinline #endif // Count trailing zeros (BSR). ICBC_FORCEINLINE int ctz(uint mask) { #if __GNUC__ return __builtin_ctz(mask); #else unsigned long index; _BitScanReverse(&index, mask); return (int)index; #endif } #if ICBC_SIMD == ICBC_SCALAR // Purely scalar version. constexpr int VEC_SIZE = 1; using VFloat = float; using VMask = bool; ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; } ICBC_FORCEINLINE VFloat vzero() { return 0.0f; } ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; } ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); } ICBC_FORCEINLINE VFloat lane_id() { return 0; } ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; } ICBC_FORCEINLINE bool all(VMask m) { return m; } ICBC_FORCEINLINE bool any(VMask m) { return m; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; } ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {} #elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41 constexpr int VEC_SIZE = 4; #if __GNUC__ // GCC needs a struct so that we can overload operators. 
union VFloat { __m128 v; float m128_f32[VEC_SIZE]; VFloat() {} VFloat(__m128 v) : v(v) {} operator __m128 & () { return v; } }; union VMask { __m128 m; VMask() {} VMask(__m128 m) : m(m) {} operator __m128 & () { return m; } }; #else using VFloat = __m128; using VMask = __m128; #endif ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m128_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float x) { return _mm_set1_ps(x); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm_load_ps(ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { VFloat v; for (int i = 0; i < VEC_SIZE; i++) { lane(v, i) = base[int(lane(index, i))]; } return v; } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP VFloat r = _mm_rcp_ps(a); return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a))); // r * (2 - r * a) #else return _mm_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { auto zero = _mm_setzero_ps(); auto one = _mm_set1_ps(1.0f); return _mm_min_ps(_mm_max_ps(a, zero), one); } // Assumes a is in [0, 1] range. ICBC_FORCEINLINE VFloat vround01(VFloat a) { #if ICBC_SIMD == ICBC_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); #else return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f))); #endif } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { #if ICBC_SIMD == ICBC_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); #else return _mm_cvtepi32_ps(_mm_cvttps_epi32(a)); #endif } ICBC_FORCEINLINE VFloat lane_id() { return _mm_set_ps(3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); } ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); } ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); } ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); } ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); } // mask ? b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { #if ICBC_SIMD == ICBC_SSE41 return _mm_blendv_ps(a, b, mask); #else return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b)); #endif } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); } ICBC_FORCEINLINE bool all(VMask m) { int value = _mm_movemask_ps(m); return value == 0x7; } ICBC_FORCEINLINE bool any(VMask m) { int value = _mm_movemask_ps(m); return value != 0; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)_mm_movemask_ps(m); } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { // First do an horizontal reduction. 
// v = [ D C | B A ] VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] VFloat mins = _mm_min_ps(v, shuf); // mins = [ D+C C+D | B+A A+B ] shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] // let the compiler avoid a mov by reusing shuf mins = _mm_min_ss(mins, shuf); mins = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0)); // Then find the index. uint mask = _mm_movemask_ps(v <= mins); return ctz(mask); } // https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) { VFloat t0 = _mm_unpacklo_ps(r0, r1); VFloat t1 = _mm_unpacklo_ps(r2, r3); VFloat t2 = _mm_unpackhi_ps(r0, r1); VFloat t3 = _mm_unpackhi_ps(r2, r3); r0 = _mm_movelh_ps(t0, t1); r1 = _mm_movehl_ps(t1, t0); r2 = _mm_movelh_ps(t2, t3); r3 = _mm_movehl_ps(t3, t2); } #elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2 constexpr int VEC_SIZE = 8; #if __GNUC__ union VFloat { __m256 v; float m256_f32[VEC_SIZE]; VFloat() {} VFloat(__m256 v) : v(v) {} operator __m256 & () { return v; } }; union VInt { __m256i v; int m256_i32[VEC_SIZE]; VInt() {} VInt(__m256i v) : v(v) {} operator __m256i & () { return v; } }; union VMask { __m256 m; VMask() {} VMask(__m256 m) : m(m) {} operator __m256 & () { return m; } }; #else using VFloat = __m256; using VInt = __m256i; using VMask = __m256; // Emulate mask vector using packed float. #endif ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m256_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm256_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float a) { return _mm256_set1_ps(a); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm256_load_ps(ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { #if ICBC_SIMD == ICBC_AVX2 return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4); #else VFloat v; for (int i = 0; i < VEC_SIZE; i++) { lane(v, i) = base[int(lane(index, i))]; } return v; #endif } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm256_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm256_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm256_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP #if ICBC_SIMD == ICBC_AVX512 VFloat r = _mm256_rcp14_ps(a); #else VFloat r = _mm256_rcp_ps(a); #endif // r = r * (2 - r * a) #if ICBC_USE_FMA == 3 || ICBC_AVX2 return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f))); #else return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a))); #endif #else return _mm256_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 return _mm256_fmadd_ps(a, b, c); #elif ICBC_USE_FMA == 4 return _mm256_macc_ps(a, b, c); #else return ((a * b) + c); #endif } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 return _mm256_fmsub_ps(a, b, c); #elif ICBC_USE_FMA == 4 return _mm256_msub_ps(a, b, c); #else return ((a * b) - c); #endif } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return vmsub(a, b, c * d); } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { __m256 zero = _mm256_setzero_ps(); __m256 one = _mm256_set1_ps(1.0f); return _mm256_min_ps(_mm256_max_ps(a, zero), one); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC); } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } ICBC_FORCEINLINE VFloat lane_id() { return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); } ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); } ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); } ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); } ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); } // mask ? b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return _mm256_blendv_ps(a, b, mask); } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b))); } ICBC_FORCEINLINE bool all(VMask m) { __m256 zero = _mm256_setzero_ps(); return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0; } ICBC_FORCEINLINE bool any(VMask m) { return _mm256_testz_ps(m, m) == 0; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)_mm256_movemask_ps(m); } // This is missing on some GCC versions. #if !defined _mm256_set_m128 #define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1) #endif ICBC_FORCEINLINE int reduce_min_index(VFloat v) { __m128 vlow = _mm256_castps256_ps128(v); __m128 vhigh = _mm256_extractf128_ps(v, 1); vlow = _mm_min_ps(vlow, vhigh); // First do an horizontal reduction. // v = [ D C | B A ] __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] __m128 mins = _mm_min_ps(vlow, shuf); // mins = [ D+C C+D | B+A A+B ] shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] mins = _mm_min_ss(mins, shuf); VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0); // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0) // Then find the index. uint mask = _mm256_movemask_ps(v <= vmin); return ctz(mask); } // AoS to SoA ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { VFloat r0 = _mm256_unpacklo_ps(a, b); VFloat r1 = _mm256_unpacklo_ps(c, d); VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20); VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31); r0 = _mm256_unpackhi_ps(a, b); r1 = _mm256_unpackhi_ps(c, d); a = _mm256_unpacklo_ps(r2, r3); b = _mm256_unpackhi_ps(r2, r3); r2 = _mm256_permute2f128_ps(r0, r1, 0x20); r3 = _mm256_permute2f128_ps(r0, r1, 0x31); c = _mm256_unpacklo_ps(r2, r3); d = _mm256_unpackhi_ps(r2, r3); } ICBC_FORCEINLINE VInt vzeroi() { return _mm256_setzero_si256(); } ICBC_FORCEINLINE VInt vbroadcast(int a) { return _mm256_set1_epi32(a); } ICBC_FORCEINLINE VInt vload(const int * ptr) { return _mm256_load_si256((const __m256i*)ptr); } ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); } ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); } ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); } ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); } ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); } // mask ? 
v[idx] : 0 ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) { return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask); } // mask ? (idx > 8 ? vhi[idx] : vlo[idx]) : 0 ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) { #if 0 VMask mhi = idx > 7; vlo = _mm256_permutevar8x32_ps(vlo, idx); vhi = _mm256_permutevar8x32_ps(vhi, idx); VFloat v = _mm256_blendv_ps(vlo, vhi, mhi); return _mm256_and_ps(v, mask); #else // Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following: vhi = _mm256_xor_ps(vhi, vlo); VFloat v = _mm256_permutevar8x32_ps(vlo, idx); VMask mhi = idx > 7; v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi)); return _mm256_and_ps(v, mask); #endif } #elif ICBC_SIMD == ICBC_AVX512 constexpr int VEC_SIZE = 16; #if __GNUC__ union VFloat { __m512 v; float m512_f32[VEC_SIZE]; VFloat() {} VFloat(__m512 v) : v(v) {} operator __m512 & () { return v; } }; union VInt { __m512i v; int m512i_i32[VEC_SIZE]; VInt() {} VInt(__m512i v) : v(v) {} operator __m512i & () { return v; } }; #else using VFloat = __m512; using VInt = __m512i; #endif struct VMask { __mmask16 m; }; ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m512_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm512_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float a) { return _mm512_set1_ps(a); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm512_load_ps(ptr); } ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) { return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr); } ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) { return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4); } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm512_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm512_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm512_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP VFloat r = _mm512_rcp14_ps(a); // r = r * (2 - r * a) return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f))); #else return _mm512_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return _mm512_fmadd_ps(a, b, c); } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return _mm512_fmsub_ps(a, b, c); } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return vmsub(a, b, c * d); } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { auto zero = _mm512_setzero_ps(); auto one = _mm512_set1_ps(1.0f); return _mm512_min_ps(_mm512_max_ps(a, zero), one); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT); } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO); } ICBC_FORCEINLINE VFloat lane_id() { return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) 
}; }

ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; }

ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return _mm512_mask_blend_ps(mask.m, a, b);
}

ICBC_FORCEINLINE VMask vbroadcast(bool b) { return { __mmask16(-int16_t(b)) }; }

// __mmask16 has 16 lanes, so "all set" is 0xFFFF.
ICBC_FORCEINLINE bool all(VMask mask) { return mask.m == 0xFFFF; }
ICBC_FORCEINLINE bool any(VMask mask) { return mask.m != 0; }
ICBC_FORCEINLINE uint mask(VMask mask) { return mask.m; }

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
    // First do a horizontal reduction.
    VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v));

    // Then find the index.
    VMask mask = (v <= vmin);
    return ctz(mask.m);
}

//ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d);    // @@

ICBC_FORCEINLINE int lane(VInt v, int i) {
    //return _mm256_extract_epi32(v, i);
    return v.m512i_i32[i];
}

ICBC_FORCEINLINE VInt vzeroi() { return _mm512_setzero_epi32(); }
ICBC_FORCEINLINE VInt vbroadcast(int a) { return _mm512_set1_epi32(a); }
ICBC_FORCEINLINE VInt vload(const int * ptr) { return _mm512_load_epi32(ptr); }

ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); }

ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return { _mm512_cmpgt_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator>=(VInt A, int b) { return { _mm512_cmpge_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return { _mm512_cmpeq_epi32_mask(A, vbroadcast(b)) }; }

// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
    return _mm512_maskz_permutexvar_ps(mask.m, idx, v);
}

#elif ICBC_SIMD == ICBC_NEON

constexpr int VEC_SIZE = 4;

#if __GNUC__
union VFloat {
    float32x4_t v;
    float e[4];
    VFloat() {}
    VFloat(float32x4_t v) : v(v) {}
    operator float32x4_t & () { return v; }
};
struct VMask {
    uint32x4_t v;
    VMask() {}
    VMask(uint32x4_t v) : v(v) {}
    operator uint32x4_t & () { return v; }
};
#else
using VFloat = float32x4_t;
using VMask = uint32x4_t;
#endif

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
#if defined(__clang__)
    return v.e[i];
#elif defined(_MSC_VER)
    return v.n128_f32[i];
#else
    return v.v[i];
#endif
}

ICBC_FORCEINLINE VFloat vzero() { return vdupq_n_f32(0.0f); }
ICBC_FORCEINLINE VFloat vbroadcast(float a) { return vdupq_n_f32(a); }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return vld1q_f32(ptr); }

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return vaddq_f32(a, b); }
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return vsubq_f32(a, b); }
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return vmulq_f32(a, b); }

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
//#if ICBC_USE_RCP
    VFloat rcp = vrecpeq_f32(a);
    //return rcp;
    return vmulq_f32(vrecpsq_f32(a, rcp), rcp);
//#else
//    return vdiv_f32(vbroadcast(1.0f), a);   // @@ ARMv8 only?
//#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vmlaq_f32(c, a, b);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return a * b - c;
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    // vmlsq_f32(a, b, c) == a - (b * c)
    return vmlsq_f32(a * b, c, d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
#if __ARM_ARCH >= 8
    return vrndnq_f32(a);   // Round to integral (to nearest, ties to even)
#else
    return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5)));
#endif
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
#if __ARM_ARCH >= 8
    return vrndq_f32(a);
#else
    return vcvtq_f32_s32(vcvtq_s32_f32(a));
#endif
}

ICBC_FORCEINLINE VFloat lane_id() {
    ICBC_ALIGN_16 float data[4] = { 0, 1, 2, 3 };
    return vld1q_f32(data);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return vbslq_f32(mask, b, a);
}

ICBC_FORCEINLINE bool all(VMask mask) {
    uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask));
    uint32x2_t m1 = vpmin_u32(m2, m2);
#if defined(_MSC_VER)
    return m1.n64_u32[0] != 0;
#else
    return m1[0] != 0;
#endif
}

ICBC_FORCEINLINE bool any(VMask mask) {
    uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask));
    uint32x2_t m1 = vpmax_u32(m2, m2);
#if defined(_MSC_VER)
    return m1.n64_u32[0] != 0;
#else
    return m1[0] != 0;
#endif
}

// @@ Is this the best we can do?
// From: https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h
ICBC_FORCEINLINE uint mask(VMask mask) {
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t1 = vtstq_u32(mask, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}

inline int reduce_min_index(VFloat v) {
#if 0
    float32x2_t m2 = vpmin_f32(vget_low_u32(V), vget_high_u32(v));
    float32x2_t m1 = vpmin_f32(m2, m2);
    float min_value = vget_lane_f32(m1, 0);

    VFloat vmin = vbroadcast(min_value);
    // (v <= vmin)
    // @@ Find the lane that contains minValue?
#endif

    // @@ Is there a better way to do this reduction?
    int min_idx = 0;
    float min_value = lane(v, 0);
    for (int i = 1; i < VEC_SIZE; i++) {
        float value = lane(v, i);
        if (value < min_value) {
            min_value = value;
            min_idx = i;
        }
    }
    return min_idx;
}

// https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    // row0 = ( x00 x01 x02 x03 )
    // row1 = ( x10 x11 x12 x13 )
    // row2 = ( x20 x21 x22 x23 )
    // row3 = ( x30 x31 x32 x33 )
    // row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 )
    // row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 )
    float32x4x2_t row01 = vtrnq_f32(a, b);
    float32x4x2_t row23 = vtrnq_f32(c, d);

    // row0 = ( x00 x10 x20 x30 )
    // row1 = ( x01 x11 x21 x31 )
    // row2 = ( x02 x12 x22 x32 )
    // row3 = ( x03 x13 x23 x33 )
    a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
    b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
    c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
    d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
}

#elif ICBC_SIMD == ICBC_VMX

constexpr int VEC_SIZE = 4;

union VFloat {
    vector float v;
    float e[4];
    VFloat() {}
    VFloat(vector float v) : v(v) {}
    operator vector float & () { return v; }
};
struct VMask {
    vector unsigned int v;
    VMask() {}
    VMask(vector unsigned int v) : v(v) {}
    operator vector unsigned int & () { return v; }
};

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.e[i];
}

ICBC_FORCEINLINE VFloat vzero() { return vec_splats(0.0f); }
ICBC_FORCEINLINE VFloat vbroadcast(float a) { return vec_splats(a); }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return vec_ld(0, ptr); }

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return vec_add(a, b); }
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return vec_sub(a, b); }
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return vec_madd(a, b, vec_splats(-0.0f)); }

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    // get the reciprocal estimate
    vector float estimate = vec_re(a);

    // one round of Newton-Raphson refinement
    vector float diff = vec_nmsub(estimate, a, vec_splats(1.0f));
    return vec_madd(diff, estimate, estimate);
#else
    return vec_div(vec_splats(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vec_madd(a, b, c);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return vec_msub(a, b, c);   // @@ Is this right?
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return vmsub(a, b, c * d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vec_min(vec_max(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    // @@ Assumes a is positive and ~small
    return vec_trunc(a + vbroadcast(0.5));
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return vec_trunc(a);
}

ICBC_FORCEINLINE VFloat lane_id() {
    return (VFloat){ 0, 1, 2, 3 };
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; }

// mask ?
b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return vec_sel(a, b, mask); } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { //VFloat vmin = //@@ Horizontal min? //return vec_cmpeq_idx(v, vmin); // @@ Is there a better way to do this reduction? int min_idx = 0; float min_value = lane(v, 0); for (int i = 1; i < VEC_SIZE; i++) { float value = lane(v, i); if (value < min_value) { min_value = value; min_idx = i; } } return min_idx; } ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { VFloat t1 = vec_mergeh(a, c); VFloat t2 = vec_mergel(a, c); VFloat t3 = vec_mergeh(b, d); VFloat t4 = vec_mergel(b, d); a = vec_mergeh(t1, t3); b = vec_mergel(t1, t3); c = vec_mergeh(t2, t4); d = vec_mergel(t2, t4); } #endif // ICBC_SIMD == * #if ICBC_SIMD != ICBC_SCALAR ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) { VFloat vb = vbroadcast(b); return vmadd(a, vb, c); } #endif struct VVector3 { VFloat x; VFloat y; VFloat z; }; ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) { VVector3 v8; v8.x = vbroadcast(v.x); v8.y = vbroadcast(v.y); v8.z = vbroadcast(v.z); return v8; } ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) { VVector3 v8; v8.x = vbroadcast(x); v8.y = vbroadcast(y); v8.z = vbroadcast(z); return v8; } /*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) { // @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html }*/ ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) { #if ICBC_SIMD == ICBC_AVX512 // @@ AVX512 transpose not implemented. __m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0); VVector3 v; v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4); v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4); v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4); return v; #else VVector3 v; v.x = vload(&ptr->x + 0 * VEC_SIZE); v.y = vload(&ptr->x + 1 * VEC_SIZE); v.z = vload(&ptr->x + 2 * VEC_SIZE); VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE); vtranspose4(v.x, v.y, v.z, tmp); return v; #endif } ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) { VVector3 v; v.x = (a.x + b.x); v.y = (a.y + b.y); v.z = (a.z + b.z); return v; } ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) { VVector3 v8; v8.x = (a.x - b.x); v8.y = (a.y - b.y); v8.z = (a.z - b.z); return v8; } ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) { VVector3 v8; v8.x = (a.x * b.x); v8.y = (a.y * b.y); v8.z = (a.z * b.z); return v8; } ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) { VVector3 v8; v8.x = (a.x * b); v8.y = (a.y * b); v8.z = (a.z * b); return v8; } ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) { VVector3 v8; v8.x = vmadd(a.x, b.x, c.x); v8.y = vmadd(a.y, b.y, c.y); v8.z = vmadd(a.z, b.z, c.z); return v8; } ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) { VVector3 v8; v8.x = vmadd(a.x, b, c.x); v8.y = vmadd(a.y, b, c.y); v8.z = vmadd(a.z, b, c.z); return v8; } #if ICBC_SIMD != ICBC_SCALAR ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) { VVector3 v8; VFloat vb = vbroadcast(b); v8.x = vmadd(a.x, vb, c.x); v8.y = vmadd(a.y, vb, c.y); v8.z = vmadd(a.z, vb, c.z); return v8; } #endif ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) { VVector3 v8; v8.x = vmsub(a.x, b, c); v8.y = vmsub(a.y, b, c); v8.z = vmsub(a.z, b, c); return v8; } 
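// Note: VVector3 keeps VEC_SIZE colors in struct-of-arrays form; x, y and z each hold one
// component for every SIMD lane, so the operators in this group evaluate VEC_SIZE candidate
// endpoints per call (one per lane of the cluster fit loops below).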
ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) { VVector3 v8; v8.x = vmsub(a.x, b, c.x); v8.y = vmsub(a.y, b, c.y); v8.z = vmsub(a.z, b, c.z); return v8; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) { VVector3 v; v.x = vm2sub(a.x, b, c.x, d); v.y = vm2sub(a.y, b, c.y, d); v.z = vm2sub(a.z, b, c.z, d); return v; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) { VVector3 v; v.x = vm2sub(a.x, b.x, c.x, d); v.y = vm2sub(a.y, b.y, c.y, d); v.z = vm2sub(a.z, b.z, c.z, d); return v; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) { VVector3 v; v.x = vm2sub(a.x, b.x, c.x, d.x); v.y = vm2sub(a.y, b.y, c.y, d.y); v.z = vm2sub(a.z, b.z, c.z, d.z); return v; } ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) { VFloat r; r = a.x * b.x + a.y * b.y + a.z * b.z; return r; } // Length squared. ICBC_FORCEINLINE VFloat vlen2(VVector3 v) { return vdot(v, v); } // mask ? b : a ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) { VVector3 r; r.x = vselect(mask, a.x, b.x); r.y = vselect(mask, a.y, b.y); r.z = vselect(mask, a.z, b.z); return r; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Color conversion functions. static const float midpoints5[32] = { 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX }; static const float midpoints6[64] = { 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX }; /*void init_tables() { for (int i = 0; i < 31; i++) { float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; midpoints5[i] = (f0 + f1) * 0.5; } midpoints5[31] = FLT_MAX; for (int i = 0; i < 63; i++) { float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; midpoints6[i] = (f0 + f1) * 0.5; } midpoints6[63] = FLT_MAX; }*/ ICBC_FORCEINLINE VFloat vround5(VFloat x) { const VFloat rb_scale = vbroadcast(31.0f); const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f); #if ICBC_PERFECT_ROUND VFloat q = vtruncate(x * rb_scale); VFloat mp = vgather(midpoints5, q); //return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale; return (q + vselect(x > mp, vzero(), vbroadcast(1))) * rb_inv_scale; #else return vround01(x * rb_scale) * rb_inv_scale; #endif } ICBC_FORCEINLINE VFloat vround6(VFloat x) { const VFloat g_scale = vbroadcast(63.0f); const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f); #if ICBC_PERFECT_ROUND VFloat q = vtruncate(x * g_scale); VFloat mp = vgather(midpoints6, q); //return (q + 
(vbroadcast(1) & (x > mp))) * g_inv_scale; return (q + vselect(x > mp, vzero(), vbroadcast(1))) * g_inv_scale; #else return vround01(x * g_scale) * g_inv_scale; #endif } ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) { VVector3 r; r.x = vround5(vsaturate(v.x)); r.y = vround6(vsaturate(v.y)); r.z = vround5(vsaturate(v.z)); return r; } static Color16 vector3_to_color16(const Vector3 & v) { // Truncate. uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f)); uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f)); uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f)); // Round exactly according to 565 bit-expansion. r += (v.x > midpoints5[r]); g += (v.y > midpoints6[g]); b += (v.z > midpoints5[b]); Color16 c; c.u = uint16((r << 11) | (g << 5) | b); return c; } static Color32 bitexpand_color16_to_color32(Color16 c16) { Color32 c32; //c32.b = (c16.b << 3) | (c16.b >> 2); //c32.g = (c16.g << 2) | (c16.g >> 4); //c32.r = (c16.r << 3) | (c16.r >> 2); //c32.a = 0xFF; c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); c32.u |= (c32.u >> 5) & 0x070007; c32.u |= (c32.u >> 6) & 0x000300; return c32; } inline Vector3 color_to_vector3(Color32 c) { return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f }; } inline Color32 vector3_to_color32(Vector3 v) { Color32 color; color.r = uint8(saturate(v.x) * 255 + 0.5f); color.g = uint8(saturate(v.y) * 255 + 0.5f); color.b = uint8(saturate(v.z) * 255 + 0.5f); color.a = 255; return color; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Input block processing. inline bool is_black(Vector3 c) { // This large threshold seems to improve compression. This is not forcing these texels to be black, just // causes them to be ignored during PCA. // @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting. //return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0]; //return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32; return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8; } // Find similar colors and combine them together. static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black) { #if 0 for (int i = 0; i < 16; i++) { colors[i] = input_colors[i].xyz; weights[i] = input_weights[i]; } return 16; #else *any_black = false; int n = 0; for (int i = 0; i < count; i++) { Vector3 ci = input_colors[i].xyz; float wi = input_weights[i]; if (wi > 0) { // Find matching color. int j; for (j = 0; j < n; j++) { if (equal(colors[j], ci, threshold)) { colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi); weights[j] += wi; break; } } // No match found. Add new color. 
if (j == n) { colors[n] = ci; weights[n] = wi; n++; } if (is_black(ci)) { *any_black = true; } } } ICBC_ASSERT(n <= count); return n; #endif } static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights) { int n = 0; for (int i = 0; i < count; i++) { Vector3 ci = input_colors[i]; float wi = input_weights[i]; if (is_black(ci)) { continue; } colors[n] = ci; weights[n] = wi; n += 1; } return n; } /////////////////////////////////////////////////////////////////////////////////////////////////// // PCA static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights) { Vector3 centroid = { 0 }; float total = 0.0f; for (int i = 0; i < n; i++) { total += weights[i]; centroid += weights[i] * points[i]; } centroid *= (1.0f / total); return centroid; } static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance) { // compute the centroid Vector3 centroid = computeCentroid(n, points, weights); // compute covariance matrix for (int i = 0; i < 6; i++) { covariance[i] = 0.0f; } for (int i = 0; i < n; i++) { Vector3 a = (points[i] - centroid); // @@ I think weight should be squared, but that seems to increase the error slightly. Vector3 b = weights[i] * a; covariance[0] += a.x * b.x; covariance[1] += a.x * b.y; covariance[2] += a.x * b.z; covariance[3] += a.y * b.y; covariance[4] += a.y * b.z; covariance[5] += a.z * b.z; } return centroid; } // @@ We should be able to do something cheaper... static Vector3 estimatePrincipalComponent(const float * __restrict matrix) { const Vector3 row0 = { matrix[0], matrix[1], matrix[2] }; const Vector3 row1 = { matrix[1], matrix[3], matrix[4] }; const Vector3 row2 = { matrix[2], matrix[4], matrix[5] }; float r0 = lengthSquared(row0); float r1 = lengthSquared(row1); float r2 = lengthSquared(row2); if (r0 > r1 && r0 > r2) return row0; if (r1 > r2) return row1; return row2; } static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) { if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) { return {0}; } Vector3 v = estimatePrincipalComponent(matrix); const int NUM = 8; for (int i = 0; i < NUM; i++) { float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; float norm = max(max(x, y), z); v = { x, y, z }; v *= (1.0f / norm); } return v; } static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights) { float matrix[6]; computeCovariance(n, points, weights, matrix); return firstEigenVector_PowerMethod(matrix); } /////////////////////////////////////////////////////////////////////////////////////////////////// // SAT struct SummedAreaTable { ICBC_ALIGN_16 float r[16]; ICBC_ALIGN_16 float g[16]; ICBC_ALIGN_16 float b[16]; ICBC_ALIGN_16 float w[16]; }; int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat) { // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. 
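// The table holds weighted prefix sums of the colors sorted along the principal axis (a 1D
// summed-area table): the sum of the first k sorted colors is sat->r/g/b/w[k - 1], so the
// cluster fit below can fetch any leading run with one indexed load and any middle run as
// the difference of two entries.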
Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights); // build the list of values int order[16]; float dps[16]; for (int i = 0; i < count; ++i) { order[i] = i; dps[i] = dot(colors[i], principal); } // stable sort for (int i = 0; i < count; ++i) { for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j) { swap(dps[j], dps[j - 1]); swap(order[j], order[j - 1]); } } float w = weights[order[0]]; sat->r[0] = colors[order[0]].x * w; sat->g[0] = colors[order[0]].y * w; sat->b[0] = colors[order[0]].z * w; sat->w[0] = w; for (int i = 1; i < count; i++) { w = weights[order[i]]; sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w; sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w; sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w; sat->w[i] = sat->w[i - 1] + w; } // Try incremental decimation: /*if (count > 4) { float threshold = 1.0f / 4; for (uint i = 1; i < count; ++i) { if (sat->r[i] - sat->r[i - 1] < threshold && sat->g[i] - sat->g[i - 1] < threshold && sat->b[i] - sat->b[i - 1] < threshold) { for (int j = i+1; j < count; j++) { sat->r[j - 1] = sat->r[j]; sat->g[j - 1] = sat->g[j]; sat->b[j - 1] = sat->b[j]; sat->w[j - 1] = sat->w[j]; } count -= 1; i -= 1; if (count == 4) break; } } }*/ for (int i = count; i < 16; i++) { sat->r[i] = FLT_MAX; sat->g[i] = FLT_MAX; sat->b[i] = FLT_MAX; sat->w[i] = FLT_MAX; } return count; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Cluster Fit struct Combinations { uint8 c0, c1, c2, pad; }; static ICBC_ALIGN_16 int s_fourClusterTotal[16]; static ICBC_ALIGN_16 int s_threeClusterTotal[16]; static ICBC_ALIGN_16 Combinations s_fourCluster[968 + 8]; static ICBC_ALIGN_16 Combinations s_threeCluster[152 + 8]; #if ICBC_USE_NEON_VTL static uint8 s_neon_vtl_index0_4[4 * 968]; static uint8 s_neon_vtl_index1_4[4 * 968]; static uint8 s_neon_vtl_index2_4[4 * 968]; static uint8 s_neon_vtl_index0_3[4 * 152]; static uint8 s_neon_vtl_index1_3[4 * 152]; #endif static void init_cluster_tables() { for (int t = 1, i = 0; t <= 16; t++) { for (int c0 = 0; c0 <= t; c0++) { for (int c1 = 0; c1 <= t - c0; c1++) { for (int c2 = 0; c2 <= t - c0 - c1; c2++) { // Skip this cluster so that the total is a multiple of 8 if (c0 == 0 && c1 == 0 && c2 == 0) continue; bool found = false; if (t > 1) { for (int j = 0; j < s_fourClusterTotal[t-2]; j++) { if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) { found = true; break; } } } if (!found) { s_fourCluster[i].c0 = uint8(c0); s_fourCluster[i].c1 = uint8(c0+c1); s_fourCluster[i].c2 = uint8(c0+c1+c2); i++; } } } } s_fourClusterTotal[t - 1] = i; } // Replicate last entry. for (int i = 0; i < 8; i++) { s_fourCluster[968 + i] = s_fourCluster[968-1]; } for (int t = 1, i = 0; t <= 16; t++) { for (int c0 = 0; c0 <= t; c0++) { for (int c1 = 0; c1 <= t - c0; c1++) { // Skip this cluster so that the total is a multiple of 8 if (c0 == 0 && c1 == 0) continue; bool found = false; if (t > 1) { for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) { if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) { found = true; break; } } } if (!found) { s_threeCluster[i].c0 = uint8(c0); s_threeCluster[i].c1 = uint8(c0 + c1); i++; } } } s_threeClusterTotal[t - 1] = i; } // Replicate last entry. 
for (int i = 0; i < 8; i++) { s_threeCluster[152 + i] = s_threeCluster[152 - 1]; } #if ICBC_USE_NEON_VTL for (int i = 0; i < 968; i++) { int c0 = (s_fourCluster[i].c0) - 1; s_neon_vtl_index0_4[4 * i + 0] = uint8(c0 * 4 + 0); s_neon_vtl_index0_4[4 * i + 1] = uint8(c0 * 4 + 1); s_neon_vtl_index0_4[4 * i + 2] = uint8(c0 * 4 + 2); s_neon_vtl_index0_4[4 * i + 3] = uint8(c0 * 4 + 3); int c1 = (s_fourCluster[i].c1) - 1; s_neon_vtl_index1_4[4 * i + 0] = uint8(c1 * 4 + 0); s_neon_vtl_index1_4[4 * i + 1] = uint8(c1 * 4 + 1); s_neon_vtl_index1_4[4 * i + 2] = uint8(c1 * 4 + 2); s_neon_vtl_index1_4[4 * i + 3] = uint8(c1 * 4 + 3); int c2 = (s_fourCluster[i].c2) - 1; s_neon_vtl_index2_4[4 * i + 0] = uint8(c2 * 4 + 0); s_neon_vtl_index2_4[4 * i + 1] = uint8(c2 * 4 + 1); s_neon_vtl_index2_4[4 * i + 2] = uint8(c2 * 4 + 2); s_neon_vtl_index2_4[4 * i + 3] = uint8(c2 * 4 + 3); } for (int i = 0; i < 152; i++) { int c0 = (s_threeCluster[i].c0) - 1; s_neon_vtl_index0_3[4 * i + 0] = uint8(c0 * 4 + 0); s_neon_vtl_index0_3[4 * i + 1] = uint8(c0 * 4 + 1); s_neon_vtl_index0_3[4 * i + 2] = uint8(c0 * 4 + 2); s_neon_vtl_index0_3[4 * i + 3] = uint8(c0 * 4 + 3); int c1 = (s_threeCluster[i].c1) - 1; s_neon_vtl_index1_3[4 * i + 0] = uint8(c1 * 4 + 0); s_neon_vtl_index1_3[4 * i + 1] = uint8(c1 * 4 + 1); s_neon_vtl_index1_3[4 * i + 2] = uint8(c1 * 4 + 2); s_neon_vtl_index1_3[4 * i + 3] = uint8(c1 * 4 + 3); } #endif } static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) { const float r_sum = sat.r[count-1]; const float g_sum = sat.g[count-1]; const float b_sum = sat.b[count-1]; const float w_sum = sat.w[count-1]; VFloat vbesterror = vbroadcast(FLT_MAX); VVector3 vbeststart = { vzero(), vzero(), vzero() }; VVector3 vbestend = { vzero(), vzero(), vzero() }; // check all possible clusters for this total order const int total_order_count = s_threeClusterTotal[count - 1]; for (int i = 0; i < total_order_count; i += VEC_SIZE) { VVector3 x0, x1; VFloat w0, w1; #if ICBC_USE_AVX512_PERMUTE auto loadmask = lane_id() < vbroadcast(float(count)); // Load sat in one register: VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); VInt c0 = (packedClusterIndex & 0xFF) - 1; VInt c1 = (packedClusterIndex >> 8) - 1; x0.x = vpermuteif(c0 >= 0, vrsat, c0); x0.y = vpermuteif(c0 >= 0, vgsat, c0); x0.z = vpermuteif(c0 >= 0, vbsat, c0); w0 = vpermuteif(c0 >= 0, vwsat, c0); x1.x = vpermuteif(c1 >= 0, vrsat, c1); x1.y = vpermuteif(c1 >= 0, vgsat, c1); x1.z = vpermuteif(c1 >= 0, vbsat, c1); w1 = vpermuteif(c1 >= 0, vwsat, c1); #elif ICBC_USE_AVX2_PERMUTE2 // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2. 
VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); VInt c0 = (packedClusterIndex & 0xFF); VInt c1 = ((packedClusterIndex >> 8)); // No need for & 0xFF if (count <= 8) { // Load sat.r in one register: VFloat rLo = vload(sat.r); VFloat gLo = vload(sat.g); VFloat bLo = vload(sat.b); VFloat wLo = vload(sat.w); x0.x = vpermuteif(c0>0, rLo, c0-1); x0.y = vpermuteif(c0>0, gLo, c0-1); x0.z = vpermuteif(c0>0, bLo, c0-1); w0 = vpermuteif(c0>0, wLo, c0-1); x1.x = vpermuteif(c1>0, rLo, c1-1); x1.y = vpermuteif(c1>0, gLo, c1-1); x1.z = vpermuteif(c1>0, bLo, c1-1); w1 = vpermuteif(c1>0, wLo, c1-1); } else { // Load sat.r in two registers: VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); w0 = vpermute2if(c0>0, wLo, wHi, c0-1); x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); w1 = vpermute2if(c1>0, wLo, wHi, c1-1); } #elif ICBC_USE_NEON_VTL uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_3[4*i]; uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_3[4*i]; if (count <= 4) { uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); } else if (count <= 8) { uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); } else if (count <= 12) { uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); } else { // Load SAT. 
uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); } #else // Scalar path x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); for (int l = 0; l < VEC_SIZE; l++) { int c0 = s_threeCluster[i + l].c0 - 1; if (c0 >= 0) { lane(x0.x, l) = sat.r[c0]; lane(x0.y, l) = sat.g[c0]; lane(x0.z, l) = sat.b[c0]; lane(w0, l) = sat.w[c0]; } int c1 = s_threeCluster[i + l].c1 - 1; if (c1 >= 0) { lane(x1.x, l) = sat.r[c1]; lane(x1.y, l) = sat.g[c1]; lane(x1.z, l) = sat.b[c1]; lane(w1, l) = sat.w[c1]; } } #endif VFloat w2 = vbroadcast(w_sum) - w1; x1 = x1 - x0; w1 = w1 - w0; VFloat alphabeta_sum = w1 * vbroadcast(0.25f); VFloat alpha2_sum = w0 + alphabeta_sum; VFloat beta2_sum = w2 + alphabeta_sum; VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f); VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; // snap to the grid a = vround_ept(a); b = vround_ept(b); // compute the error VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); // apply the metric to the error term VFloat error = vdot(e1, vbroadcast(metric_sqr)); // keep the solution if it wins auto mask = (error < vbesterror); // I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX. //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? vbesterror = vselect(mask, vbesterror, error); vbeststart = vselect(mask, vbeststart, a); vbestend = vselect(mask, vbestend, b); } int bestindex = reduce_min_index(vbesterror); start->x = lane(vbeststart.x, bestindex); start->y = lane(vbeststart.y, bestindex); start->z = lane(vbeststart.z, bestindex); end->x = lane(vbestend.x, bestindex); end->y = lane(vbestend.y, bestindex); end->z = lane(vbestend.z, bestindex); } static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) { const float r_sum = sat.r[count-1]; const float g_sum = sat.g[count-1]; const float b_sum = sat.b[count-1]; const float w_sum = sat.w[count-1]; VFloat vbesterror = vbroadcast(FLT_MAX); VVector3 vbeststart = { vzero(), vzero(), vzero() }; VVector3 vbestend = { vzero(), vzero(), vzero() }; // check all possible clusters for this total order const int total_order_count = s_fourClusterTotal[count - 1]; for (int i = 0; i < total_order_count; i += VEC_SIZE) { VVector3 x0, x1, x2; VFloat w0, w1, w2; /* // Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation. uint akku = 0; // Compute alpha & beta for this permutation. 
#pragma unroll for (int i = 0; i < 16; i++) { const uint bits = permutation >> (2*i); alphax_sum += alphaTable4[bits & 3] * colors[i]; akku += prods4[bits & 3]; } float alpha2_sum = float(akku >> 16); float beta2_sum = float((akku >> 8) & 0xff); float alphabeta_sum = float(akku & 0xff); float3 betax_sum = 9.0f * color_sum - alphax_sum; */ #if ICBC_USE_AVX512_PERMUTE auto loadmask = lane_id() < vbroadcast(float(count)); // Load sat in one register: VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); VInt c0 = (packedClusterIndex & 0xFF) - 1; VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1; VInt c2 = ((packedClusterIndex >> 16)) - 1; // @@ No need for & x0.x = vpermuteif(c0 >= 0, vrsat, c0); x0.y = vpermuteif(c0 >= 0, vgsat, c0); x0.z = vpermuteif(c0 >= 0, vbsat, c0); w0 = vpermuteif(c0 >= 0, vwsat, c0); x1.x = vpermuteif(c1 >= 0, vrsat, c1); x1.y = vpermuteif(c1 >= 0, vgsat, c1); x1.z = vpermuteif(c1 >= 0, vbsat, c1); w1 = vpermuteif(c1 >= 0, vwsat, c1); x2.x = vpermuteif(c2 >= 0, vrsat, c2); x2.y = vpermuteif(c2 >= 0, vgsat, c2); x2.z = vpermuteif(c2 >= 0, vbsat, c2); w2 = vpermuteif(c2 >= 0, vwsat, c2); #elif ICBC_USE_AVX2_PERMUTE2 // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); VInt c0 = (packedClusterIndex & 0xFF); VInt c1 = ((packedClusterIndex >> 8) & 0xFF); VInt c2 = ((packedClusterIndex >> 16)); // @@ No need for & if (count <= 8) { // Load sat.r in one register: VFloat rLo = vload(sat.r); VFloat gLo = vload(sat.g); VFloat bLo = vload(sat.b); VFloat wLo = vload(sat.w); x0.x = vpermuteif(c0>0, rLo, c0-1); x0.y = vpermuteif(c0>0, gLo, c0-1); x0.z = vpermuteif(c0>0, bLo, c0-1); w0 = vpermuteif(c0>0, wLo, c0-1); x1.x = vpermuteif(c1>0, rLo, c1-1); x1.y = vpermuteif(c1>0, gLo, c1-1); x1.z = vpermuteif(c1>0, bLo, c1-1); w1 = vpermuteif(c1>0, wLo, c1-1); x2.x = vpermuteif(c2>0, rLo, c2-1); x2.y = vpermuteif(c2>0, gLo, c2-1); x2.z = vpermuteif(c2>0, bLo, c2-1); w2 = vpermuteif(c2>0, wLo, c2-1); } else { // Load sat.r in two registers: VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); w0 = vpermute2if(c0>0, wLo, wHi, c0-1); x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); w1 = vpermute2if(c1>0, wLo, wHi, c1-1); x2.x = vpermute2if(c2>0, rLo, rHi, c2-1); x2.y = vpermute2if(c2>0, gLo, gHi, c2-1); x2.z = vpermute2if(c2>0, bLo, bHi, c2-1); w2 = vpermute2if(c2>0, wLo, wHi, c2-1); } #elif ICBC_USE_NEON_VTL uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_4[4*i]; uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_4[4*i]; uint8x16_t idx2 = (uint8x16_t &)s_neon_vtl_index2_4[4*i]; if (count <= 4) { uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); w0 = 
vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2)); w2 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2)); } else if (count <= 8) { uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2)); w2 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2)); } else if (count <= 12) { uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2)); w2 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2)); } else { uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2)); w2 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2)); } #else // Scalar path x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero(); for (int l = 0; l < VEC_SIZE; l++) { int c0 = s_fourCluster[i + l].c0 - 1; if (c0 >= 0) { lane(x0.x, l) = sat.r[c0]; lane(x0.y, l) = sat.g[c0]; lane(x0.z, l) = sat.b[c0]; lane(w0, l) = sat.w[c0]; } int c1 = s_fourCluster[i + l].c1 - 1; if (c1 >= 0) { lane(x1.x, l) = sat.r[c1]; lane(x1.y, l) = sat.g[c1]; lane(x1.z, l) = sat.b[c1]; 
lane(w1, l) = sat.w[c1]; } int c2 = s_fourCluster[i + l].c2 - 1; if (c2 >= 0) { lane(x2.x, l) = sat.r[c2]; lane(x2.y, l) = sat.g[c2]; lane(x2.z, l) = sat.b[c2]; lane(w2, l) = sat.w[c2]; } } #endif VFloat w3 = vbroadcast(w_sum) - w2; x2 = x2 - x1; x1 = x1 - x0; w2 = w2 - w1; w1 = w1 - w0; VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0)); VFloat beta2_sum = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3)); VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f); VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0)); VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; // snap to the grid a = vround_ept(a); b = vround_ept(b); // compute the error VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); // apply the metric to the error term VFloat error = vdot(e1, vbroadcast(metric_sqr)); // keep the solution if it wins auto mask = (error < vbesterror); // We could mask the unused lanes here, but instead set the invalid SAT entries to FLT_MAX. //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? vbesterror = vselect(mask, vbesterror, error); vbeststart = vselect(mask, vbeststart, a); vbestend = vselect(mask, vbestend, b); } int bestindex = reduce_min_index(vbesterror); start->x = lane(vbeststart.x, bestindex); start->y = lane(vbeststart.y, bestindex); start->z = lane(vbeststart.z, bestindex); end->x = lane(vbestend.x, bestindex); end->y = lane(vbestend.y, bestindex); end->z = lane(vbestend.z, bestindex); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Palette evaluation. 
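// The D3D10, NVIDIA and AMD decoders below differ only in how they derive the two interpolated
// palette entries from the endpoints. As an illustrative worked example of the D3D10 rules that
// follow (not an additional code path): expanding c0 = (31, 63, 31) and c1 = (0, 0, 0) from 5:6:5
// gives palette[0] = (255, 255, 255) and palette[1] = (0, 0, 0), so four-color mode produces
// palette[2] = (2 * 255 + 0) / 3 = 170 and palette[3] = (2 * 0 + 255) / 3 = 85 in every channel,
// while three-color mode (c0.u <= c1.u) would produce palette[2] = (255 + 0) / 2 = 127 and a
// transparent black palette[3].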
Decoder s_decoder = Decoder_D3D10; // D3D10 inline void evaluate_palette4_d3d10(Color32 palette[4]) { palette[2].r = (2 * palette[0].r + palette[1].r) / 3; palette[2].g = (2 * palette[0].g + palette[1].g) / 3; palette[2].b = (2 * palette[0].b + palette[1].b) / 3; palette[2].a = 0xFF; palette[3].r = (2 * palette[1].r + palette[0].r) / 3; palette[3].g = (2 * palette[1].g + palette[0].g) / 3; palette[3].b = (2 * palette[1].b + palette[0].b) / 3; palette[3].a = 0xFF; } inline void evaluate_palette3_d3d10(Color32 palette[4]) { palette[2].r = (palette[0].r + palette[1].r) / 2; palette[2].g = (palette[0].g + palette[1].g) / 2; palette[2].b = (palette[0].b + palette[1].b) / 2; palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_d3d10(palette); } else { evaluate_palette3_d3d10(palette); } } // NV inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8); palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256); palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8); palette[2].a = 0xFF; palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8); palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256); palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8); palette[3].a = 0xFF; } inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; palette[2].r = uint8(((c0.r + c1.r) * 33) / 8); palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256); palette[2].b = uint8(((c0.b + c1.b) * 33) / 8); palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_nv(c0, c1, palette); } else { evaluate_palette3_nv(c0, c1, palette); } } // AMD inline void evaluate_palette4_amd(Color32 palette[4]) { palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6); palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6); palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6); palette[2].a = 0xFF; palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6); palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6); palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6); palette[3].a = 0xFF; } inline void evaluate_palette3_amd(Color32 palette[4]) { palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2); palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2); palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2); palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_amd(palette); } else { evaluate_palette3_amd(palette); } } inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) { if (decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette); else if (decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette); else if (decoder == Decoder_AMD) 
evaluate_palette_amd(c0, c1, palette); } static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { Color32 palette32[4]; evaluate_palette(c0, c1, palette32); for (int i = 0; i < 4; i++) { palette[i] = color_to_vector3(palette32[i]); } } static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) { Color32 palette[4]; evaluate_palette(block->col0, block->col1, palette, decoder); for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; Color32 c = palette[index]; rgba_block[4 * i + 0] = c.r; rgba_block[4 * i + 1] = c.g; rgba_block[4 * i + 2] = c.b; rgba_block[4 * i + 3] = c.a; } } /////////////////////////////////////////////////////////////////////////////////////////////////// // Error evaluation. // Different ways of estimating the error. static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { Vector3 d = (p - c) * w * 255; return dot(d, d); } static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) { Vector3 d = (color_to_vector3(p) - c) * w * 255; return dot(d, d); } static int evaluate_mse(const Color32 & p, const Color32 & c) { return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); } static int evaluate_mse(const Color32 palette[4], const Color32 & c) { int e0 = evaluate_mse(palette[0], c); int e1 = evaluate_mse(palette[1], c); int e2 = evaluate_mse(palette[2], c); int e3 = evaluate_mse(palette[3], c); return min(min(e0, e1), min(e2, e3)); } // Returns MSE error in [0-255] range. static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { Color32 palette[4]; evaluate_palette(output->col0, output->col1, palette); return evaluate_mse(palette[index], color); } // Returns weighted MSE error in [0-255] range. static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { float total = 0.0f; for (int i = 0; i < count; i++) { total += weights[i] * evaluate_mse(palette, colors[i]); } return total; } static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) { float total = 0.0f; for (int i = 0; i < count; i++) { total += evaluate_mse(palette, colors[i]); } return total; } static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { Color32 palette[4]; evaluate_palette(output->col0, output->col1, palette); // evaluate error for each index. float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (output->indices >> (2 * i)) & 3; error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); } return error; } static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) { // evaluate error for each index. float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (indices >> (2 * i)) & 3; error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); } return error; } float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) { Color32 palette[4]; evaluate_palette(block->col0, block->col1, palette, decoder); // evaluate error for each index. 
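    // Each of the 16 texels consumes two bits of block->indices (texel 0 in the least significant
    // bits), and each 2-bit value selects one of the four palette entries computed above.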
float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; Color32 c; c.r = rgba_block[4 * i + 0]; c.g = rgba_block[4 * i + 1]; c.b = rgba_block[4 * i + 2]; c.a = 255; error += evaluate_mse(palette[index], c); } return error; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Index selection // @@ Can we interleave the two uint16 at once? inline uint32 interleave_uint16_with_zeros(uint32 input) { uint32 word = input; word = (word ^ (word << 8 )) & 0x00ff00ff; word = (word ^ (word << 4 )) & 0x0f0f0f0f; word = (word ^ (word << 2 )) & 0x33333333; word = (word ^ (word << 1 )) & 0x55555555; return word; } // Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/ ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) { #if ICBC_BMI2 return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa); #else return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1); #endif } static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { uint indices0 = 0; uint indices1 = 0; VVector3 vw = vbroadcast(color_weights); VVector3 vp0 = vbroadcast(palette[0]) * vw; VVector3 vp1 = vbroadcast(palette[1]) * vw; VVector3 vp2 = vbroadcast(palette[2]) * vw; VVector3 vp3 = vbroadcast(palette[3]) * vw; for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vc - vp0); VFloat d1 = vlen2(vc - vp1); VFloat d2 = vlen2(vc - vp2); VFloat d3 = vlen2(vc - vp3); VMask b1 = d1 > d2; VMask b2 = d0 > d2; VMask x0 = b1 & b2; VMask b0 = d0 > d3; VMask b3 = d1 > d3; x0 = x0 | (b0 & b3); VMask b4 = d2 > d3; VMask x1 = b0 & b4; indices0 |= mask(x0) << i; indices1 |= mask(x1) << i; } return interleave(indices1, indices0); } static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) { uint indices0 = 0; uint indices1 = 0; VVector3 vw = vbroadcast(color_weights); VVector3 vp0 = vbroadcast(palette[0]) * vw; VVector3 vp1 = vbroadcast(palette[1]) * vw; VVector3 vp2 = vbroadcast(palette[2]) * vw; if (allow_transparent_black) { for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vp0 - vc); VFloat d1 = vlen2(vp1 - vc); VFloat d2 = vlen2(vp2 - vc); VFloat d3 = vdot(vc, vc); VMask i1 = (d1 < d2); VMask i2 = (d2 <= d0) & (d2 <= d1); VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); indices0 |= mask(i2 | i3) << i; indices1 |= mask(i1 | i3) << i; } } else { for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vc - vp0); VFloat d1 = vlen2(vc - vp1); VFloat d2 = vlen2(vc - vp2); VMask i1 = (d1 < d2); VMask i2 = (d2 <= d0) & (d2 <= d1); indices0 |= mask(i2) << i; indices1 |= mask(i1) << i; } } return interleave(indices1, indices0); } static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { #if 0 Vector3 p0 = palette[0] * color_weights; Vector3 p1 = palette[1] * color_weights; Vector3 p2 = palette[2] * color_weights; Vector3 p3 = palette[3] * color_weights; uint indices = 0; for (int i = 0; i < 16; i++) { Vector3 ci = input_colors[i].xyz * color_weights; float d0 = lengthSquared(p0 - ci); float d1 = lengthSquared(p1 - ci); float d2 = lengthSquared(p2 - ci); float d3 = lengthSquared(p3 - ci); uint index; if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; 
else if (d1 < d2 && d1 < d3) index = 1;
        else if (d2 < d3) index = 2;
        else index = 3;

        indices |= index << (2 * i);
    }

    return indices;
#else
    uint indices0 = 0;
    uint indices1 = 0;

    VVector3 vw = vbroadcast(color_weights);
    VVector3 vp0 = vbroadcast(palette[0]) * vw;
    VVector3 vp1 = vbroadcast(palette[1]) * vw;
    VVector3 vp2 = vbroadcast(palette[2]) * vw;
    VVector3 vp3 = vbroadcast(palette[3]) * vw;

    for (int i = 0; i < 16; i += VEC_SIZE) {
        VVector3 vc = vload(&input_colors[i]) * vw;

        VFloat d0 = vlen2(vc - vp0);
        VFloat d1 = vlen2(vc - vp1);
        VFloat d2 = vlen2(vc - vp2);
        VFloat d3 = vlen2(vc - vp3);

        //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3);
        VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3);
        VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3);
        VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);

        //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0));
        indices0 |= mask(i2 | i3) << i;
        indices1 |= mask(i1 | i3) << i;
    }

    uint indices = interleave(indices1, indices0);
    return indices;
#endif
}

static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
    Color16 color0 = vector3_to_color16(v0);
    Color16 color1 = vector3_to_color16(v1);

    if (color0.u > color1.u) {
        swap(color0, color1);
    }

    Vector3 palette[4];
    evaluate_palette(color0, color1, palette);

    block->col0 = color0;
    block->col1 = color1;
    block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette);

    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}

static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
    Color16 color0 = vector3_to_color16(v0);
    Color16 color1 = vector3_to_color16(v1);

    if (color0.u < color1.u) {
        swap(color0, color1);
    }

    Vector3 palette[4];
    evaluate_palette(color0, color1, palette);

    block->col0 = color0;
    block->col1 = color1;
    block->indices = compute_indices4(input_colors, color_weights, palette);

    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}

// Least squares fitting of color end points for the given indices. @@ Take weights into account.
static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
{
    float alpha2_sum = 0.0f;
    float beta2_sum = 0.0f;
    float alphabeta_sum = 0.0f;
    Vector3 alphax_sum = { 0,0,0 };
    Vector3 betax_sum = { 0,0,0 };

    for (int i = 0; i < count; i++) {
        const uint bits = indices >> (2 * i);

        float beta = float(bits & 1);
        if (bits & 2) beta = (1 + beta) / 3.0f;
        float alpha = 1.0f - beta;

        alpha2_sum += alpha * alpha;
        beta2_sum += beta * beta;
        alphabeta_sum += alpha * beta;
        alphax_sum += alpha * colors[i].xyz;
        betax_sum += beta * colors[i].xyz;
    }

    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
    if (equal(denom, 0.0f)) return false;

    float factor = 1.0f / denom;
    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
    return true;
}

// Least squares optimization with custom factors.
// This allows us to pass the standard [1, 0, 2/3, 1/3] weights by default, but also to use alternative mappings when the number of clusters is not 4.
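// For example, the default overload below passes { 1, 0, 2/3, 1/3 }, so index k contributes
// alpha = factors[k] and beta = 1 - factors[k] to the normal equations. A hypothetical 3-color
// mapping could instead pass { 1, 0, 0.5f, 0.5f }, mirroring the weights that optimize_end_points3
// uses further down.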
static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b) { float alpha2_sum = 0.0f; float beta2_sum = 0.0f; float alphabeta_sum = 0.0f; Vector3 alphax_sum = { 0,0,0 }; Vector3 betax_sum = { 0,0,0 }; for (int i = 0; i < count; i++) { const uint idx = (indices >> (2 * i)) & 3; float alpha = factors[idx]; float beta = 1 - alpha; alpha2_sum += alpha * alpha; beta2_sum += beta * beta; alphabeta_sum += alpha * beta; alphax_sum += alpha * colors[i]; betax_sum += beta * colors[i]; } float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; if (equal(denom, 0.0f)) return false; float factor = 1.0f / denom; *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); return true; } static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b) { float factors[4] = { 1, 0, 2.f / 3, 1.f / 3 }; return optimize_end_points4(indices, colors, count, factors, a, b); } // Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account. static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) { float alpha2_sum = 0.0f; float beta2_sum = 0.0f; float alphabeta_sum = 0.0f; Vector3 alphax_sum = { 0,0,0 }; Vector3 betax_sum = { 0,0,0 }; for (int i = 0; i < count; i++) { const uint bits = indices >> (2 * i); float beta = float(bits & 1); if (bits & 2) beta = 0.5f; float alpha = 1.0f - beta; alpha2_sum += alpha * alpha; beta2_sum += beta * beta; alphabeta_sum += alpha * beta; alphax_sum += alpha * colors[i]; betax_sum += beta * colors[i]; } float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; if (equal(denom, 0.0f)) return false; float factor = 1.0f / denom; *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); return true; } // find minimum and maximum colors based on bounding box in color space inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) { *c0 = { 0,0,0 }; *c1 = { 1,1,1 }; for (int i = 0; i < count; i++) { *c0 = max(*c0, colors[i]); *c1 = min(*c1, colors[i]); } } inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) { Vector3 center = (*c0 + *c1) * 0.5f; /*Vector3 center = colors[0]; for (int i = 1; i < count; i++) { center = center * float(i-1) / i + colors[i] / i; }*/ /*Vector3 center = colors[0]; for (int i = 1; i < count; i++) { center += colors[i]; } center /= count;*/ float cov_xz = 0.0f; float cov_yz = 0.0f; for (int i = 0; i < count; i++) { Vector3 t = colors[i] - center; cov_xz += t.x * t.z; cov_yz += t.y * t.z; } float x0 = c0->x; float y0 = c0->y; float x1 = c1->x; float y1 = c1->y; if (cov_xz < 0) { swap(x0, x1); } if (cov_yz < 0) { swap(y0, y1); } *c0 = { x0, y0, c0->z }; *c1 = { x1, y1, c1->z }; } inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1) { float bias = (8.0f / 255.0f) / 16.0f; Vector3 inset = (*c0 - *c1) / 16.0f - scalar_to_vector3(bias); *c0 = saturate(*c0 - inset); *c1 = saturate(*c1 + inset); } // Single color lookup tables from: // https://github.com/nothings/stb/blob/master/stb_dxt.h static uint8 s_match5[256][2]; static uint8 
s_match6[256][2]; static inline int Lerp13(int a, int b) { // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. return (a * 2 + b) / 3; } static void PrepareOptTable5(uint8 * table, Decoder decoder) { uint8 expand[32]; for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; for (int mn = 0; mn < 32; mn++) { for (int mx = 0; mx < 32; mx++) { int mine = expand[mn]; int maxe = expand[mx]; int err; int amd_r = (43 * maxe + 21 * mine + 32) >> 6; int amd_err = abs(amd_r - i); int nv_r = ((2 * mx + mn) * 22) / 8; int nv_err = abs(nv_r - i); if (decoder == Decoder_D3D10) { // DX10 spec says that interpolation must be within 3% of "correct" result, // add this as error term. (normally we'd expect a random distribution of // +-1.5% error, but nowhere in the spec does it say that the error has to be // unbiased - better safe than sorry). int r = (maxe * 2 + mine) / 3; err = abs(r - i) * 100 + abs(mx - mn) * 3; // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } else if (decoder == Decoder_NVIDIA) { err = nv_err; } else /*if (decoder == Decoder_AMD)*/ { err = amd_err; } if (err < bestErr) { bestErr = err; table[i * 2 + 0] = uint8(mx); table[i * 2 + 1] = uint8(mn); } } } } } static void PrepareOptTable6(uint8 * table, Decoder decoder) { uint8 expand[64]; for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; for (int mn = 0; mn < 64; mn++) { for (int mx = 0; mx < 64; mx++) { int mine = expand[mn]; int maxe = expand[mx]; int err; int amd_g = (43 * maxe + 21 * mine + 32) >> 6; int amd_err = abs(amd_g - i); int nv_g = (256 * mine + (maxe - mine) / 4 + 128 + (maxe - mine) * 80) / 256; int nv_err = abs(nv_g - i); if (decoder == Decoder_D3D10) { // DX10 spec says that interpolation must be within 3% of "correct" result, // add this as error term. (normally we'd expect a random distribution of // +-1.5% error, but nowhere in the spec does it say that the error has to be // unbiased - better safe than sorry). int g = (maxe * 2 + mine) / 3; err = abs(g - i) * 100 + abs(mx - mn) * 3; // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } else if (decoder == Decoder_NVIDIA) { err = nv_err; } else /*if (decoder == Decoder_AMD)*/ { err = amd_err; } if (err < bestErr) { bestErr = err; table[i * 2 + 0] = uint8(mx); table[i * 2 + 1] = uint8(mn); } } } } } static void init_single_color_tables(Decoder decoder) { // Prepare single color lookup tables. 
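    // For every 8-bit channel value i, these tables store the (max, min) endpoint pair whose index-2
    // interpolant best reproduces i under the selected decoder; compress_dxt1_single_color_optimal
    // below then encodes a solid-color block with one table lookup per channel and a single repeated index.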
PrepareOptTable5(&s_match5[0][0], decoder); PrepareOptTable6(&s_match6[0][0], decoder); } // Single color compressor, based on: // https://mollyrocket.com/forums/viewtopic.php?t=392 static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { output->col0.r = s_match5[c.r][0]; output->col0.g = s_match6[c.g][0]; output->col0.b = s_match5[c.b][0]; output->col1.r = s_match5[c.r][1]; output->col1.g = s_match6[c.g][1]; output->col1.b = s_match5[c.b][1]; output->indices = 0xaaaaaaaa; if (output->col0.u < output->col1.u) { swap(output->col0.u, output->col1.u); output->indices ^= 0x55555555; } } static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockDXT1 * output) { Vector3 metric_sqr = color_weights * color_weights; SummedAreaTable sat; int sat_count = compute_sat(colors, weights, count, &sat); Vector3 start, end; cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output); if (three_color_mode) { if (try_transparent_black) { Vector3 tmp_colors[16]; float tmp_weights[16]; int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights); if (!tmp_count) return best_error; sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat); } cluster_fit_three(sat, sat_count, metric_sqr, &start, &end); BlockDXT1 three_color_block; float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block); if (three_color_error < best_error) { best_error = three_color_error; *output = three_color_block; } } return best_error; } static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) { // TODO: // - Optimize palette evaluation when updating only one channel. // - try all diagonals. // Things that don't help: // - Alternate endpoint updates. // - Randomize order. // - If one direction does not improve, test opposite direction next. static const int8 deltas[16][3] = { {1,0,0}, {0,1,0}, {0,0,1}, {-1,0,0}, {0,-1,0}, {0,0,-1}, {1,1,0}, {1,0,1}, {0,1,1}, {-1,-1,0}, {-1,0,-1}, {0,-1,-1}, {-1,1,0}, //{-1,0,1}, {1,-1,0}, {0,-1,1}, //{1,0,-1}, {0,1,-1}, }; float best_error = input_error; int lastImprovement = 0; for (int i = 0; i < 256; i++) { BlockDXT1 refined = *output; int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] }; if ((i / 16) & 1) { refined.col0.r += delta[0]; refined.col0.g += delta[1]; refined.col0.b += delta[2]; } else { refined.col1.r += delta[0]; refined.col1.g += delta[1]; refined.col1.b += delta[2]; } if (!three_color_mode) { if (refined.col0.u == refined.col1.u) refined.col1.g += 1; if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u); } Vector3 palette[4]; evaluate_palette(output->col0, output->col1, palette); refined.indices = compute_indices(input_colors, color_weights, palette); float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined); if (refined_error < best_error) { best_error = refined_error; *output = refined; lastImprovement = i; } // Early out if the last 32 steps didn't improve error. 
if (i - lastImprovement > 32) break; } return best_error; } struct Options { float threshold = 0.0f; bool box_fit = false; bool least_squares_fit = false; bool cluster_fit = false; bool cluster_fit_3 = false; bool cluster_fit_3_black_only = false; bool endpoint_refinement = false; }; static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) { Options opt; switch (level) { case Quality_Level1: // Box fit + least squares fit. opt.box_fit = true; opt.least_squares_fit = true; opt.threshold = 1.0f / 256; break; case Quality_Level2: // Cluster fit 4, threshold = 24. opt.box_fit = true; opt.least_squares_fit = true; opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 24; break; case Quality_Level3: // Cluster fit 4, threshold = 32. opt.box_fit = true; opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 32; break; case Quality_Level4: // Cluster fit 3+4, threshold = 48. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 48; break; case Quality_Level5: // Cluster fit 3+4, threshold = 64. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 64; break; case Quality_Level6: // Cluster fit 3+4, threshold = 96. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 96; break; case Quality_Level7: // Cluster fit 3+4, threshold = 128. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 128; break; case Quality_Level8: // Cluster fit 3+4, threshold = 256. opt.cluster_fit = true; opt.cluster_fit_3 = enable_three_color_mode; opt.threshold = 1.0f / 256; break; case Quality_Level9: // Cluster fit 3+4, threshold = 256 + Refinement. opt.cluster_fit = true; opt.cluster_fit_3 = enable_three_color_mode; opt.threshold = 1.0f / 256; opt.endpoint_refinement = true; break; } return opt; } static float compress_dxt1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockDXT1 * output) { Options opt = setup_options(level, three_color_mode, three_color_black); Vector3 colors[16]; float weights[16]; bool any_black = false; int count; if (opt.cluster_fit) { count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black); } else { for (int i = 0; i < 16; i++) { colors[i] = input_colors[i].xyz; } count = 16; } if (count == 0) { // Output trivial block. output->col0.u = 0; output->col1.u = 0; output->indices = 0; return 0; } // Cluster fit cannot handle single color blocks, so encode them optimally. if (count == 1) { compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); return evaluate_mse(input_colors, input_weights, color_weights, output); } float error = FLT_MAX; // Quick end point selection. if (opt.box_fit) { Vector3 c0, c1; fit_colors_bbox(colors, count, &c0, &c1); inset_bbox(&c0, &c1); select_diagonal(colors, count, &c0, &c1); error = output_block4(input_colors, input_weights, color_weights, c0, c1, output); // Refine color for the selected indices. 
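        // That is, keep the indices just chosen for the box-fit endpoints and solve the least squares
        // system in optimize_end_points4 for new endpoints; the refit block is only kept if it lowers the error.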
if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
            BlockDXT1 optimized_block;
            float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block);

            if (optimized_error < error) {
                error = optimized_error;
                *output = optimized_block;
            }
        }
    }

    if (opt.cluster_fit) {
        // @@ Use current endpoints as input for initial PCA approximation?

        bool use_three_color_black = any_black && three_color_black;
        bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only);

        // Try cluster fit.
        BlockDXT1 cluster_fit_output;
        float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output);

        if (cluster_fit_error < error) {
            *output = cluster_fit_output;
            error = cluster_fit_error;
        }
    }

    if (opt.endpoint_refinement) {
        error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output);
    }

    return error;
}

// Public API

void init_dxt1(Decoder decoder) {
    s_decoder = decoder;
    init_single_color_tables(decoder);
    init_cluster_tables();
}

void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
    decode_dxt1((const BlockDXT1 *)block, rgba_block, decoder);
}

float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) {
    return evaluate_dxt1_error(rgba_block, (const BlockDXT1 *)dxt_block, decoder);
}

float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool three_color_mode, bool three_color_black, void * output) {
    return compress_dxt1(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, three_color_black, (BlockDXT1*)output);
}

} // icbc

//
// Do not pollute preprocessor definitions.
// #undef ICBC_SIMD
// #undef ICBC_ASSERT
// #undef ICBC_SCALAR
// #undef ICBC_SSE2
// #undef ICBC_SSE41
// #undef ICBC_AVX1
// #undef ICBC_AVX2
// #undef ICBC_AVX512
// #undef ICBC_NEON
// #undef ICBC_VMX
// #undef ICBC_USE_FMA
// #undef ICBC_USE_AVX2_PERMUTE2
// #undef ICBC_USE_AVX512_PERMUTE
// #undef ICBC_USE_NEON_VTL
// #undef ICBC_PERFECT_ROUND

#endif // ICBC_IMPLEMENTATION

// Version History:
// v1.00 - Initial release.
// v1.01 - Added SPMD code path with AVX support.
// v1.02 - Removed SIMD code path.
// v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection.
// v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations.
// v1.05 - Bug fixes. Small optimizations.

// Copyright (c) 2020 Ignacio Castano
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
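// Illustrative usage sketch. The buffer names below are placeholders; input colors are 16 RGBA
// floats per block in [0, 1] and the output is one 8-byte BC1 block, matching the public API above:
//
//   #define ICBC_IMPLEMENTATION
//   #include "icbc.h"
//
//   icbc::init_dxt1(icbc::Decoder_D3D10);
//
//   float input_colors[16 * 4];   // one 4x4 block, RGBA in [0, 1]
//   float input_weights[16];      // per-texel weights, e.g. all 1.0f
//   float color_weights[3] = { 1.0f, 1.0f, 1.0f };
//   unsigned char block[8];       // output BC1/DXT1 block
//
//   float mse = icbc::compress_dxt1(icbc::Quality_Default, input_colors, input_weights,
//                                   color_weights, /*three_color_mode=*/true,
//                                   /*three_color_black=*/true, block);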