// icbc.h v1.05 // A High Quality SIMD BC1 Encoder by Ignacio Castano . // // LICENSE: // MIT license at the end of this file. #ifndef ICBC_H #define ICBC_H namespace icbc { enum Decoder { Decoder_D3D10 = 0, Decoder_NVIDIA = 1, Decoder_AMD = 2 }; void init_dxt1(Decoder decoder = Decoder_D3D10); enum Quality { Quality_Level1, // Box fit + least squares fit. Quality_Level2, // Cluster fit 4, threshold = 24. Quality_Level3, // Cluster fit 4, threshold = 32. Quality_Level4, // Cluster fit 4, threshold = 48. Quality_Level5, // Cluster fit 4, threshold = 64. Quality_Level6, // Cluster fit 4, threshold = 96. Quality_Level7, // Cluster fit 4, threshold = 128. Quality_Level8, // Cluster fit 4+3, threshold = 256. Quality_Level9, // Cluster fit 4+3, threshold = 256 + Refinement. Quality_Fast = Quality_Level1, Quality_Default = Quality_Level8, Quality_Max = Quality_Level9, }; void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10); float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10); float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output); } #endif // ICBC_H #ifdef ICBC_IMPLEMENTATION // Instruction level support must be chosen at compile time setting ICBC_SIMD to one of these values: #define ICBC_SCALAR 0 #define ICBC_SSE2 1 #define ICBC_SSE41 2 #define ICBC_AVX1 3 #define ICBC_AVX2 4 #define ICBC_AVX512 5 #define ICBC_NEON -1 #define ICBC_VMX -2 #if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) #define ICBC_X86 1 #endif #if (defined(__arm__) || defined(_M_ARM)) #define ICBC_ARM 1 #endif #if (defined(__PPC__) || defined(_M_PPC)) #define ICBC_PPC 1 #endif // SIMD version. #ifndef ICBC_SIMD #if ICBC_X86 #if __AVX512F__ #define ICBC_SIMD ICBC_AVX512 #elif __AVX2__ #define ICBC_SIMD ICBC_AVX2 #elif __AVX__ #define ICBC_SIMD ICBC_AVX1 #elif __SSE4_1__ #define ICBC_SIMD ICBC_SSE41 #elif __SSE2__ #define ICBC_SIMD ICBC_SSE2 #else #define ICBC_SIMD ICBC_SCALAR #endif #endif #if ICBC_ARM #if __ARM_NEON__ #define ICBC_SIMD ICBC_NEON #else #define ICBC_SIMD ICBC_SCALAR #endif #endif #if ICBC_PPC #define ICBC_SIMD ICBC_VMX #endif #endif // AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess. #ifndef ICBC_USE_FMA //#define ICBC_USE_FMA 3 //#define ICBC_USE_FMA 4 #endif #if ICBC_SIMD >= ICBC_AVX2 #define ICBC_BMI2 1 #endif // Apparently rcp is not deterministic (different precision on Intel and AMD), enable if you don't care about that for a small performance boost. //#define ICBC_USE_RCP 1 #if ICBC_SIMD == ICBC_AVX2 #define ICBC_USE_AVX2_PERMUTE2 1 // Using permutevar8x32 and bitops. #endif #if ICBC_SIMD == ICBC_AVX512 #define ICBC_USE_AVX512_PERMUTE 1 #endif #if ICBC_SIMD == ICBC_NEON #define ICBC_USE_NEON_VTL 0 // Not tested. #endif // Some experimental knobs: #define ICBC_PERFECT_ROUND 0 // Enable perfect rounding to compute cluster fit residual. 
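// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the library). It assumes the
// input is a 4x4 block of RGBA texels given as 16*4 floats in [0, 1] and that
// the output is one 8-byte BC1 block (see BlockDXT1 below); the helper
// function name and the all-ones weights are made up for this example.
//
//     #define ICBC_IMPLEMENTATION
//     #include "icbc.h"
//
//     void encode_block(const float input_rgba[16 * 4], void * bc1_block) {
//         float weights[16];
//         for (int i = 0; i < 16; i++) weights[i] = 1.0f;     // Equal texel importance.
//         const float color_weights[3] = { 1, 1, 1 };         // Uniform channel weighting.
//         icbc::compress_dxt1(icbc::Quality_Default, input_rgba, weights, color_weights,
//                             /*three_color_mode=*/true, /*three_color_black=*/true, bc1_block);
//     }
//
// Call icbc::init_dxt1() once before compressing or evaluating blocks; it
// selects the decoder used for error metrics and builds the internal tables.
// ---------------------------------------------------------------------------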
#if ICBC_SIMD >= ICBC_SSE2
#include <emmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_SSE41
#include <smmintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX1
#include <immintrin.h>
#endif

#if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER
#include <zmmintrin.h>
#endif

#if ICBC_SIMD == ICBC_NEON
#include <arm_neon.h>
#endif

#if ICBC_SIMD == ICBC_VMX
#include <altivec.h>
#endif

#if _MSC_VER
#include <intrin.h> // _BitScanReverse
#endif

#include <stdint.h>
#include <stdlib.h> // abs
#include <string.h> // memset
#include <math.h>   // fabsf
#include <float.h>  // FLT_MAX

#ifndef ICBC_ASSERT
#if _DEBUG
#define ICBC_ASSERT assert
#include <assert.h>
#else
#define ICBC_ASSERT(x)
#endif
#endif

namespace icbc {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Templates

template <typename T> inline void swap(T & a, T & b) { T temp(a); a = b; b = temp; }

template <typename T> inline T max(const T & a, const T & b) { return (b < a) ? a : b; }

template <typename T> inline T min(const T & a, const T & b) { return (a < b) ? a : b; }

template <typename T> inline T clamp(const T & x, const T & a, const T & b) { return min(max(x, a), b); }

template <typename T> inline T square(const T & a) { return a * a; }

///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Types

typedef uint8_t uint8;
typedef int8_t int8;
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint32_t uint;

struct Color16 {
    union {
        struct {
            uint16 b : 5;
            uint16 g : 6;
            uint16 r : 5;
        };
        uint16 u;
    };
};

struct Color32 {
    union {
        struct {
            uint8 b, g, r, a;
        };
        uint32 u;
    };
};

struct BlockDXT1 {
    Color16 col0;
    Color16 col1;
    uint32 indices;
};

struct Vector3 {
    float x;
    float y;
    float z;

    inline void operator+=(Vector3 v) { x += v.x; y += v.y; z += v.z; }
    inline void operator*=(Vector3 v) { x *= v.x; y *= v.y; z *= v.z; }
    inline void operator*=(float s) { x *= s; y *= s; z *= s; }
};

struct Vector4 {
    union {
        struct {
            float x, y, z, w;
        };
        Vector3 xyz;
    };
};

inline Vector3 operator*(Vector3 v, float s) { return { v.x * s, v.y * s, v.z * s }; }
inline Vector3 operator*(float s, Vector3 v) { return { v.x * s, v.y * s, v.z * s }; }
inline Vector3 operator*(Vector3 a, Vector3 b) { return { a.x * b.x, a.y * b.y, a.z * b.z }; }
inline float dot(Vector3 a, Vector3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
inline Vector3 operator+(Vector3 a, Vector3 b) { return { a.x + b.x, a.y + b.y, a.z + b.z }; }
inline Vector3 operator-(Vector3 a, Vector3 b) { return { a.x - b.x, a.y - b.y, a.z - b.z }; }
inline Vector3 operator/(Vector3 v, float s) { return { v.x / s, v.y / s, v.z / s }; }
inline float saturate(float x) { return clamp(x, 0.0f, 1.0f); }
inline Vector3 saturate(Vector3 v) { return { saturate(v.x), saturate(v.y), saturate(v.z) }; }
inline Vector3 min(Vector3 a, Vector3 b) { return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) }; }
inline Vector3 max(Vector3 a, Vector3 b) { return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) }; }
inline bool operator==(const Vector3 & a, const Vector3 & b) { return a.x == b.x && a.y == b.y && a.z == b.z; }
inline Vector3 scalar_to_vector3(float f) { return { f, f, f }; }
inline float lengthSquared(Vector3 v) { return dot(v, v); }

inline bool equal(float a, float b, float epsilon = 0.0001f) {
    // http://realtimecollisiondetection.net/blog/?p=89
    //return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
    return fabsf(a - b) < epsilon;
}

inline bool equal(Vector3 a, Vector3 b, float epsilon) {
    return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD

#ifndef ICBC_ALIGN_16
#if __GNUC__
# define
ICBC_ALIGN_16 __attribute__ ((__aligned__ (16))) #else // _MSC_VER # define ICBC_ALIGN_16 __declspec(align(16)) #endif #endif #if __GNUC__ #define ICBC_FORCEINLINE inline __attribute__((always_inline)) #else #define ICBC_FORCEINLINE __forceinline #endif // Count trailing zeros (BSR). ICBC_FORCEINLINE int ctz(uint mask) { #if __GNUC__ return __builtin_ctz(mask); #else unsigned long index; _BitScanReverse(&index, mask); return (int)index; #endif } #if ICBC_SIMD == ICBC_SCALAR // Purely scalar version. constexpr int VEC_SIZE = 1; using VFloat = float; using VMask = bool; ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; } ICBC_FORCEINLINE VFloat vzero() { return 0.0f; } ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; } ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); } ICBC_FORCEINLINE VFloat lane_id() { return 0; } ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; } ICBC_FORCEINLINE bool all(VMask m) { return m; } ICBC_FORCEINLINE bool any(VMask m) { return m; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; } ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {} #elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41 constexpr int VEC_SIZE = 4; #if __GNUC__ // GCC needs a struct so that we can overload operators. 
union VFloat { __m128 v; float m128_f32[VEC_SIZE]; VFloat() {} VFloat(__m128 v) : v(v) {} operator __m128 & () { return v; } }; union VMask { __m128 m; VMask() {} VMask(__m128 m) : m(m) {} operator __m128 & () { return m; } }; #else using VFloat = __m128; using VMask = __m128; #endif ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m128_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float x) { return _mm_set1_ps(x); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm_load_ps(ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { VFloat v; for (int i = 0; i < VEC_SIZE; i++) { lane(v, i) = base[int(lane(index, i))]; } return v; } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP VFloat r = _mm_rcp_ps(a); return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a))); // r * (2 - r * a) #else return _mm_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { auto zero = _mm_setzero_ps(); auto one = _mm_set1_ps(1.0f); return _mm_min_ps(_mm_max_ps(a, zero), one); } // Assumes a is in [0, 1] range. ICBC_FORCEINLINE VFloat vround01(VFloat a) { #if ICBC_SIMD == ICBC_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); #else return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f))); #endif } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { #if ICBC_SIMD == ICBC_SSE41 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); #else return _mm_cvtepi32_ps(_mm_cvttps_epi32(a)); #endif } ICBC_FORCEINLINE VFloat lane_id() { return _mm_set_ps(3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); } ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); } ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); } ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); } ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); } // mask ? b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { #if ICBC_SIMD == ICBC_SSE41 return _mm_blendv_ps(a, b, mask); #else return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b)); #endif } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); } ICBC_FORCEINLINE bool all(VMask m) { int value = _mm_movemask_ps(m); return value == 0x7; } ICBC_FORCEINLINE bool any(VMask m) { int value = _mm_movemask_ps(m); return value != 0; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)_mm_movemask_ps(m); } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { // First do an horizontal reduction. 
// v = [ D C | B A ] VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] VFloat mins = _mm_min_ps(v, shuf); // mins = [ D+C C+D | B+A A+B ] shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] // let the compiler avoid a mov by reusing shuf mins = _mm_min_ss(mins, shuf); mins = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0)); // Then find the index. uint mask = _mm_movemask_ps(v <= mins); return ctz(mask); } // https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) { VFloat t0 = _mm_unpacklo_ps(r0, r1); VFloat t1 = _mm_unpacklo_ps(r2, r3); VFloat t2 = _mm_unpackhi_ps(r0, r1); VFloat t3 = _mm_unpackhi_ps(r2, r3); r0 = _mm_movelh_ps(t0, t1); r1 = _mm_movehl_ps(t1, t0); r2 = _mm_movelh_ps(t2, t3); r3 = _mm_movehl_ps(t3, t2); } #elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2 constexpr int VEC_SIZE = 8; #if __GNUC__ union VFloat { __m256 v; float m256_f32[VEC_SIZE]; VFloat() {} VFloat(__m256 v) : v(v) {} operator __m256 & () { return v; } }; union VInt { __m256i v; int m256_i32[VEC_SIZE]; VInt() {} VInt(__m256i v) : v(v) {} operator __m256i & () { return v; } }; union VMask { __m256 m; VMask() {} VMask(__m256 m) : m(m) {} operator __m256 & () { return m; } }; #else using VFloat = __m256; using VInt = __m256i; using VMask = __m256; // Emulate mask vector using packed float. #endif ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m256_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm256_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float a) { return _mm256_set1_ps(a); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm256_load_ps(ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { #if ICBC_SIMD == ICBC_AVX2 return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4); #else VFloat v; for (int i = 0; i < VEC_SIZE; i++) { lane(v, i) = base[int(lane(index, i))]; } return v; #endif } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm256_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm256_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm256_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP #if ICBC_SIMD == ICBC_AVX512 VFloat r = _mm256_rcp14_ps(a); #else VFloat r = _mm256_rcp_ps(a); #endif // r = r * (2 - r * a) #if ICBC_USE_FMA == 3 || ICBC_AVX2 return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f))); #else return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a))); #endif #else return _mm256_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 return _mm256_fmadd_ps(a, b, c); #elif ICBC_USE_FMA == 4 return _mm256_macc_ps(a, b, c); #else return ((a * b) + c); #endif } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { #if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2 return _mm256_fmsub_ps(a, b, c); #elif ICBC_USE_FMA == 4 return _mm256_msub_ps(a, b, c); #else return ((a * b) - c); #endif } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return vmsub(a, b, c * d); } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { __m256 zero = _mm256_setzero_ps(); __m256 one = _mm256_set1_ps(1.0f); return _mm256_min_ps(_mm256_max_ps(a, zero), one); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC); } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); } ICBC_FORCEINLINE VFloat lane_id() { return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); } ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); } ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); } ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); } ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); } // mask ? b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return _mm256_blendv_ps(a, b, mask); } ICBC_FORCEINLINE VMask vbroadcast(bool b) { return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b))); } ICBC_FORCEINLINE bool all(VMask m) { __m256 zero = _mm256_setzero_ps(); return _mm256_testc_ps(_mm256_cmp_ps(zero, zero, _CMP_EQ_UQ), m) == 0; } ICBC_FORCEINLINE bool any(VMask m) { return _mm256_testz_ps(m, m) == 0; } ICBC_FORCEINLINE uint mask(VMask m) { return (uint)_mm256_movemask_ps(m); } // This is missing on some GCC versions. #if !defined _mm256_set_m128 #define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1) #endif ICBC_FORCEINLINE int reduce_min_index(VFloat v) { __m128 vlow = _mm256_castps256_ps128(v); __m128 vhigh = _mm256_extractf128_ps(v, 1); vlow = _mm_min_ps(vlow, vhigh); // First do an horizontal reduction. // v = [ D C | B A ] __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] __m128 mins = _mm_min_ps(vlow, shuf); // mins = [ D+C C+D | B+A A+B ] shuf = _mm_movehl_ps(shuf, mins); // [ C D | D+C C+D ] mins = _mm_min_ss(mins, shuf); VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0); // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0) // Then find the index. uint mask = _mm256_movemask_ps(v <= vmin); return ctz(mask); } // AoS to SoA ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { VFloat r0 = _mm256_unpacklo_ps(a, b); VFloat r1 = _mm256_unpacklo_ps(c, d); VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20); VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31); r0 = _mm256_unpackhi_ps(a, b); r1 = _mm256_unpackhi_ps(c, d); a = _mm256_unpacklo_ps(r2, r3); b = _mm256_unpackhi_ps(r2, r3); r2 = _mm256_permute2f128_ps(r0, r1, 0x20); r3 = _mm256_permute2f128_ps(r0, r1, 0x31); c = _mm256_unpacklo_ps(r2, r3); d = _mm256_unpackhi_ps(r2, r3); } ICBC_FORCEINLINE VInt vzeroi() { return _mm256_setzero_si256(); } ICBC_FORCEINLINE VInt vbroadcast(int a) { return _mm256_set1_epi32(a); } ICBC_FORCEINLINE VInt vload(const int * ptr) { return _mm256_load_si256((const __m256i*)ptr); } ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); } ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); } ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); } ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); } ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); } // mask ? 
v[idx] : 0 ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) { return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask); } // mask ? (idx > 8 ? vhi[idx] : vlo[idx]) : 0 ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) { #if 0 VMask mhi = idx > 7; vlo = _mm256_permutevar8x32_ps(vlo, idx); vhi = _mm256_permutevar8x32_ps(vhi, idx); VFloat v = _mm256_blendv_ps(vlo, vhi, mhi); return _mm256_and_ps(v, mask); #else // Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following: vhi = _mm256_xor_ps(vhi, vlo); VFloat v = _mm256_permutevar8x32_ps(vlo, idx); VMask mhi = idx > 7; v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi)); return _mm256_and_ps(v, mask); #endif } #elif ICBC_SIMD == ICBC_AVX512 constexpr int VEC_SIZE = 16; #if __GNUC__ union VFloat { __m512 v; float m512_f32[VEC_SIZE]; VFloat() {} VFloat(__m512 v) : v(v) {} operator __m512 & () { return v; } }; union VInt { __m512i v; int m512i_i32[VEC_SIZE]; VInt() {} VInt(__m512i v) : v(v) {} operator __m512i & () { return v; } }; #else using VFloat = __m512; using VInt = __m512i; #endif struct VMask { __mmask16 m; }; ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v.m512_f32[i]; } ICBC_FORCEINLINE VFloat vzero() { return _mm512_setzero_ps(); } ICBC_FORCEINLINE VFloat vbroadcast(float a) { return _mm512_set1_ps(a); } ICBC_FORCEINLINE VFloat vload(const float * ptr) { return _mm512_load_ps(ptr); } ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) { return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr); } ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) { return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr); } ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) { return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4); } ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return _mm512_add_ps(a, b); } ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return _mm512_sub_ps(a, b); } ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return _mm512_mul_ps(a, b); } ICBC_FORCEINLINE VFloat vrcp(VFloat a) { #if ICBC_USE_RCP VFloat r = _mm512_rcp14_ps(a); // r = r * (2 - r * a) return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f))); #else return _mm512_div_ps(vbroadcast(1.0f), a); #endif } // a*b+c ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return _mm512_fmadd_ps(a, b, c); } ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return _mm512_fmsub_ps(a, b, c); } ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return vmsub(a, b, c * d); } ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { auto zero = _mm512_setzero_ps(); auto one = _mm512_set1_ps(1.0f); return _mm512_min_ps(_mm512_max_ps(a, zero), one); } ICBC_FORCEINLINE VFloat vround01(VFloat a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT); } ICBC_FORCEINLINE VFloat vtruncate(VFloat a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO); } ICBC_FORCEINLINE VFloat lane_id() { return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; } ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; } ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) 
}; }

ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; }

ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return _mm512_mask_blend_ps(mask.m, a, b);
}

ICBC_FORCEINLINE VMask vbroadcast(bool b) { return { __mmask16(-int16_t(b)) }; }

// __mmask16 has 16 lanes, so "all set" is 0xFFFF.
ICBC_FORCEINLINE bool all(VMask mask) { return mask.m == 0xFFFF; }
ICBC_FORCEINLINE bool any(VMask mask) { return mask.m != 0; }
ICBC_FORCEINLINE uint mask(VMask mask) { return mask.m; }

ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
    // First do a horizontal reduction.
    VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v));

    // Then find the index.
    VMask mask = (v <= vmin);
    return ctz(mask.m);
}

//ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d);    // @@

ICBC_FORCEINLINE int lane(VInt v, int i) {
    //return _mm256_extract_epi32(v, i);
    return v.m512i_i32[i];
}

ICBC_FORCEINLINE VInt vzeroi() { return _mm512_setzero_epi32(); }
ICBC_FORCEINLINE VInt vbroadcast(int a) { return _mm512_set1_epi32(a); }
ICBC_FORCEINLINE VInt vload(const int * ptr) { return _mm512_load_epi32(ptr); }

ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); }

ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return { _mm512_cmpgt_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator>=(VInt A, int b) { return { _mm512_cmpge_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return { _mm512_cmpeq_epi32_mask(A, vbroadcast(b)) }; }

// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
    return _mm512_maskz_permutexvar_ps(mask.m, idx, v);
}

#elif ICBC_SIMD == ICBC_NEON

constexpr int VEC_SIZE = 4;

#if __GNUC__
union VFloat {
    float32x4_t v;
    float e[4];
    VFloat() {}
    VFloat(float32x4_t v) : v(v) {}
    operator float32x4_t & () { return v; }
};
struct VMask {
    uint32x4_t v;
    VMask() {}
    VMask(uint32x4_t v) : v(v) {}
    operator uint32x4_t & () { return v; }
};
#else
using VFloat = float32x4_t;
using VMask = uint32x4_t;
#endif

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
#if defined(__clang__)
    return v.e[i];
#elif defined(_MSC_VER)
    return v.n128_f32[i];
#else
    return v.v[i];
#endif
}

ICBC_FORCEINLINE VFloat vzero() { return vdupq_n_f32(0.0f); }
ICBC_FORCEINLINE VFloat vbroadcast(float a) { return vdupq_n_f32(a); }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return vld1q_f32(ptr); }

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return vaddq_f32(a, b); }
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return vsubq_f32(a, b); }
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return vmulq_f32(a, b); }

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
//#if ICBC_USE_RCP
    VFloat rcp = vrecpeq_f32(a);
    //return rcp;
    return vmulq_f32(vrecpsq_f32(a, rcp), rcp);
//#else
//    return vdiv_f32(vbroadcast(1.0f), a);   // @@ ARMv8 only?
//#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vmlaq_f32(c, a, b);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return a * b - c;
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    // vmlsq_f32(a, b, c) == a - (b * c)
    return vmlsq_f32(a * b, c, d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
#if __ARM_ARCH >= 8
    return vrndnq_f32(a);   // Round to integral (to nearest, ties to even)
#else
    return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5)));
#endif
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
#if __ARM_ARCH >= 8
    return vrndq_f32(a);
#else
    return vcvtq_f32_s32(vcvtq_s32_f32(a));
#endif
}

ICBC_FORCEINLINE VFloat lane_id() {
    ICBC_ALIGN_16 float data[4] = { 0, 1, 2, 3 };
    return vld1q_f32(data);
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; }

// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
    return vbslq_f32(mask, b, a);
}

ICBC_FORCEINLINE bool all(VMask mask) {
    uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask));
    uint32x2_t m1 = vpmin_u32(m2, m2);
#if defined(_MSC_VER)
    return m1.n64_u32[0] != 0;
#else
    return m1[0] != 0;
#endif
}

ICBC_FORCEINLINE bool any(VMask mask) {
    uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask));
    uint32x2_t m1 = vpmax_u32(m2, m2);
#if defined(_MSC_VER)
    return m1.n64_u32[0] != 0;
#else
    return m1[0] != 0;
#endif
}

// @@ Is this the best we can do?
// From: https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h
ICBC_FORCEINLINE uint mask(VMask mask) {
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t1 = vtstq_u32(mask, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}

inline int reduce_min_index(VFloat v) {
#if 0
    float32x2_t m2 = vpmin_f32(vget_low_u32(V), vget_high_u32(v));
    float32x2_t m1 = vpmin_f32(m2, m2);
    float min_value = vget_lane_f32(m1, 0);

    VFloat vmin = vbroadcast(min_value);
    // (v <= vmin)
    // @@ Find the lane that contains minValue?
#endif

    // @@ Is there a better way to do this reduction?
    int min_idx = 0;
    float min_value = lane(v, 0);
    for (int i = 1; i < VEC_SIZE; i++) {
        float value = lane(v, i);
        if (value < min_value) {
            min_value = value;
            min_idx = i;
        }
    }
    return min_idx;
}

// https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
    // row0 = ( x00 x01 x02 x03 )
    // row1 = ( x10 x11 x12 x13 )
    // row2 = ( x20 x21 x22 x23 )
    // row3 = ( x30 x31 x32 x33 )
    // row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 )
    // row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 )
    float32x4x2_t row01 = vtrnq_f32(a, b);
    float32x4x2_t row23 = vtrnq_f32(c, d);

    // row0 = ( x00 x10 x20 x30 )
    // row1 = ( x01 x11 x21 x31 )
    // row2 = ( x02 x12 x22 x32 )
    // row3 = ( x03 x13 x23 x33 )
    a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
    b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
    c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
    d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
}

#elif ICBC_SIMD == ICBC_VMX

constexpr int VEC_SIZE = 4;

union VFloat {
    vector float v;
    float e[4];
    VFloat() {}
    VFloat(vector float v) : v(v) {}
    operator vector float & () { return v; }
};
struct VMask {
    vector unsigned int v;
    VMask() {}
    VMask(vector unsigned int v) : v(v) {}
    operator vector unsigned int & () { return v; }
};

ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
    return v.e[i];
}

ICBC_FORCEINLINE VFloat vzero() { return vec_splats(0.0f); }
ICBC_FORCEINLINE VFloat vbroadcast(float a) { return vec_splats(a); }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return vec_ld(0, ptr); }

ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) { return vec_add(a, b); }
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) { return vec_sub(a, b); }
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) { return vec_madd(a, b, vec_splats(-0.0f)); }

ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
    // get the reciprocal estimate
    vector float estimate = vec_re(a);

    // one round of Newton-Raphson refinement
    vector float diff = vec_nmsub(estimate, a, vec_splats(1.0f));
    return vec_madd(diff, estimate, estimate);
#else
    return vec_div(vec_splats(1.0f), a);
#endif
}

// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
    return vec_madd(a, b, c);
}

ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
    return vec_msub(a, b, c);   // @@ Is this right?
}

ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
    return vmsub(a, b, c * d);
}

ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
    return vec_min(vec_max(a, vzero()), vbroadcast(1));
}

ICBC_FORCEINLINE VFloat vround01(VFloat a) {
    // @@ Assumes a is positive and ~small
    return vec_trunc(a + vbroadcast(0.5));
}

ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
    return vec_trunc(a);
}

ICBC_FORCEINLINE VFloat lane_id() {
    return (VFloat){ 0, 1, 2, 3 };
}

ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; }

ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; }

// mask ?
b : a ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return vec_sel(a, b, mask); } ICBC_FORCEINLINE int reduce_min_index(VFloat v) { //VFloat vmin = //@@ Horizontal min? //return vec_cmpeq_idx(v, vmin); // @@ Is there a better way to do this reduction? int min_idx = 0; float min_value = lane(v, 0); for (int i = 1; i < VEC_SIZE; i++) { float value = lane(v, i); if (value < min_value) { min_value = value; min_idx = i; } } return min_idx; } ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) { VFloat t1 = vec_mergeh(a, c); VFloat t2 = vec_mergel(a, c); VFloat t3 = vec_mergeh(b, d); VFloat t4 = vec_mergel(b, d); a = vec_mergeh(t1, t3); b = vec_mergel(t1, t3); c = vec_mergeh(t2, t4); d = vec_mergel(t2, t4); } #endif // ICBC_SIMD == * #if ICBC_SIMD != ICBC_SCALAR ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) { VFloat vb = vbroadcast(b); return vmadd(a, vb, c); } #endif struct VVector3 { VFloat x; VFloat y; VFloat z; }; ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) { VVector3 v8; v8.x = vbroadcast(v.x); v8.y = vbroadcast(v.y); v8.z = vbroadcast(v.z); return v8; } ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) { VVector3 v8; v8.x = vbroadcast(x); v8.y = vbroadcast(y); v8.z = vbroadcast(z); return v8; } /*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) { // @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html }*/ ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) { #if ICBC_SIMD == ICBC_AVX512 // @@ AVX512 transpose not implemented. __m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0); VVector3 v; v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4); v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4); v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4); return v; #else VVector3 v; v.x = vload(&ptr->x + 0 * VEC_SIZE); v.y = vload(&ptr->x + 1 * VEC_SIZE); v.z = vload(&ptr->x + 2 * VEC_SIZE); VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE); vtranspose4(v.x, v.y, v.z, tmp); return v; #endif } ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) { VVector3 v; v.x = (a.x + b.x); v.y = (a.y + b.y); v.z = (a.z + b.z); return v; } ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) { VVector3 v8; v8.x = (a.x - b.x); v8.y = (a.y - b.y); v8.z = (a.z - b.z); return v8; } ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) { VVector3 v8; v8.x = (a.x * b.x); v8.y = (a.y * b.y); v8.z = (a.z * b.z); return v8; } ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) { VVector3 v8; v8.x = (a.x * b); v8.y = (a.y * b); v8.z = (a.z * b); return v8; } ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) { VVector3 v8; v8.x = vmadd(a.x, b.x, c.x); v8.y = vmadd(a.y, b.y, c.y); v8.z = vmadd(a.z, b.z, c.z); return v8; } ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) { VVector3 v8; v8.x = vmadd(a.x, b, c.x); v8.y = vmadd(a.y, b, c.y); v8.z = vmadd(a.z, b, c.z); return v8; } #if ICBC_SIMD != ICBC_SCALAR ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) { VVector3 v8; VFloat vb = vbroadcast(b); v8.x = vmadd(a.x, vb, c.x); v8.y = vmadd(a.y, vb, c.y); v8.z = vmadd(a.z, vb, c.z); return v8; } #endif ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) { VVector3 v8; v8.x = vmsub(a.x, b, c); v8.y = vmsub(a.y, b, c); v8.z = vmsub(a.z, b, c); return v8; } 
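// Note: VVector3 keeps VEC_SIZE colors in struct-of-arrays form; x, y and z each hold one
// component for every SIMD lane, so the operators in this group evaluate VEC_SIZE candidate
// endpoints per call (one per lane of the cluster fit loops below).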
ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) { VVector3 v8; v8.x = vmsub(a.x, b, c.x); v8.y = vmsub(a.y, b, c.y); v8.z = vmsub(a.z, b, c.z); return v8; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) { VVector3 v; v.x = vm2sub(a.x, b, c.x, d); v.y = vm2sub(a.y, b, c.y, d); v.z = vm2sub(a.z, b, c.z, d); return v; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) { VVector3 v; v.x = vm2sub(a.x, b.x, c.x, d); v.y = vm2sub(a.y, b.y, c.y, d); v.z = vm2sub(a.z, b.z, c.z, d); return v; } ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) { VVector3 v; v.x = vm2sub(a.x, b.x, c.x, d.x); v.y = vm2sub(a.y, b.y, c.y, d.y); v.z = vm2sub(a.z, b.z, c.z, d.z); return v; } ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) { VFloat r; r = a.x * b.x + a.y * b.y + a.z * b.z; return r; } // Length squared. ICBC_FORCEINLINE VFloat vlen2(VVector3 v) { return vdot(v, v); } // mask ? b : a ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) { VVector3 r; r.x = vselect(mask, a.x, b.x); r.y = vselect(mask, a.y, b.y); r.z = vselect(mask, a.z, b.z); return r; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Color conversion functions. static const float midpoints5[32] = { 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX }; static const float midpoints6[64] = { 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX }; /*void init_tables() { for (int i = 0; i < 31; i++) { float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; midpoints5[i] = (f0 + f1) * 0.5; } midpoints5[31] = FLT_MAX; for (int i = 0; i < 63; i++) { float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; midpoints6[i] = (f0 + f1) * 0.5; } midpoints6[63] = FLT_MAX; }*/ ICBC_FORCEINLINE VFloat vround5(VFloat x) { const VFloat rb_scale = vbroadcast(31.0f); const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f); #if ICBC_PERFECT_ROUND VFloat q = vtruncate(x * rb_scale); VFloat mp = vgather(midpoints5, q); //return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale; return (q + vselect(x > mp, vzero(), vbroadcast(1))) * rb_inv_scale; #else return vround01(x * rb_scale) * rb_inv_scale; #endif } ICBC_FORCEINLINE VFloat vround6(VFloat x) { const VFloat g_scale = vbroadcast(63.0f); const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f); #if ICBC_PERFECT_ROUND VFloat q = vtruncate(x * g_scale); VFloat mp = vgather(midpoints6, q); //return (q + 
(vbroadcast(1) & (x > mp))) * g_inv_scale; return (q + vselect(x > mp, vzero(), vbroadcast(1))) * g_inv_scale; #else return vround01(x * g_scale) * g_inv_scale; #endif } ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) { VVector3 r; r.x = vround5(vsaturate(v.x)); r.y = vround6(vsaturate(v.y)); r.z = vround5(vsaturate(v.z)); return r; } static Color16 vector3_to_color16(const Vector3 & v) { // Truncate. uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f)); uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f)); uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f)); // Round exactly according to 565 bit-expansion. r += (v.x > midpoints5[r]); g += (v.y > midpoints6[g]); b += (v.z > midpoints5[b]); Color16 c; c.u = uint16((r << 11) | (g << 5) | b); return c; } static Color32 bitexpand_color16_to_color32(Color16 c16) { Color32 c32; //c32.b = (c16.b << 3) | (c16.b >> 2); //c32.g = (c16.g << 2) | (c16.g >> 4); //c32.r = (c16.r << 3) | (c16.r >> 2); //c32.a = 0xFF; c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); c32.u |= (c32.u >> 5) & 0x070007; c32.u |= (c32.u >> 6) & 0x000300; return c32; } inline Vector3 color_to_vector3(Color32 c) { return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f }; } inline Color32 vector3_to_color32(Vector3 v) { Color32 color; color.r = uint8(saturate(v.x) * 255 + 0.5f); color.g = uint8(saturate(v.y) * 255 + 0.5f); color.b = uint8(saturate(v.z) * 255 + 0.5f); color.a = 255; return color; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Input block processing. inline bool is_black(Vector3 c) { // This large threshold seems to improve compression. This is not forcing these texels to be black, just // causes them to be ignored during PCA. // @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting. //return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0]; //return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32; return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8; } // Find similar colors and combine them together. static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black) { #if 0 for (int i = 0; i < 16; i++) { colors[i] = input_colors[i].xyz; weights[i] = input_weights[i]; } return 16; #else *any_black = false; int n = 0; for (int i = 0; i < count; i++) { Vector3 ci = input_colors[i].xyz; float wi = input_weights[i]; if (wi > 0) { // Find matching color. int j; for (j = 0; j < n; j++) { if (equal(colors[j], ci, threshold)) { colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi); weights[j] += wi; break; } } // No match found. Add new color. 
if (j == n) { colors[n] = ci; weights[n] = wi; n++; } if (is_black(ci)) { *any_black = true; } } } ICBC_ASSERT(n <= count); return n; #endif } static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights) { int n = 0; for (int i = 0; i < count; i++) { Vector3 ci = input_colors[i]; float wi = input_weights[i]; if (is_black(ci)) { continue; } colors[n] = ci; weights[n] = wi; n += 1; } return n; } /////////////////////////////////////////////////////////////////////////////////////////////////// // PCA static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights) { Vector3 centroid = { 0 }; float total = 0.0f; for (int i = 0; i < n; i++) { total += weights[i]; centroid += weights[i] * points[i]; } centroid *= (1.0f / total); return centroid; } static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance) { // compute the centroid Vector3 centroid = computeCentroid(n, points, weights); // compute covariance matrix for (int i = 0; i < 6; i++) { covariance[i] = 0.0f; } for (int i = 0; i < n; i++) { Vector3 a = (points[i] - centroid); // @@ I think weight should be squared, but that seems to increase the error slightly. Vector3 b = weights[i] * a; covariance[0] += a.x * b.x; covariance[1] += a.x * b.y; covariance[2] += a.x * b.z; covariance[3] += a.y * b.y; covariance[4] += a.y * b.z; covariance[5] += a.z * b.z; } return centroid; } // @@ We should be able to do something cheaper... static Vector3 estimatePrincipalComponent(const float * __restrict matrix) { const Vector3 row0 = { matrix[0], matrix[1], matrix[2] }; const Vector3 row1 = { matrix[1], matrix[3], matrix[4] }; const Vector3 row2 = { matrix[2], matrix[4], matrix[5] }; float r0 = lengthSquared(row0); float r1 = lengthSquared(row1); float r2 = lengthSquared(row2); if (r0 > r1 && r0 > r2) return row0; if (r1 > r2) return row1; return row2; } static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) { if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) { return {0}; } Vector3 v = estimatePrincipalComponent(matrix); const int NUM = 8; for (int i = 0; i < NUM; i++) { float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; float norm = max(max(x, y), z); v = { x, y, z }; v *= (1.0f / norm); } return v; } static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights) { float matrix[6]; computeCovariance(n, points, weights, matrix); return firstEigenVector_PowerMethod(matrix); } /////////////////////////////////////////////////////////////////////////////////////////////////// // SAT struct SummedAreaTable { ICBC_ALIGN_16 float r[16]; ICBC_ALIGN_16 float g[16]; ICBC_ALIGN_16 float b[16]; ICBC_ALIGN_16 float w[16]; }; int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat) { // I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce best results. 
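// The table holds weighted prefix sums of the colors sorted along the principal axis (a 1D
// summed-area table): the sum of the first k sorted colors is sat->r/g/b/w[k - 1], so the
// cluster fit below can fetch any leading run with one indexed load and any middle run as
// the difference of two entries.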
Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights); // build the list of values int order[16]; float dps[16]; for (int i = 0; i < count; ++i) { order[i] = i; dps[i] = dot(colors[i], principal); } // stable sort for (int i = 0; i < count; ++i) { for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j) { swap(dps[j], dps[j - 1]); swap(order[j], order[j - 1]); } } float w = weights[order[0]]; sat->r[0] = colors[order[0]].x * w; sat->g[0] = colors[order[0]].y * w; sat->b[0] = colors[order[0]].z * w; sat->w[0] = w; for (int i = 1; i < count; i++) { w = weights[order[i]]; sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w; sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w; sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w; sat->w[i] = sat->w[i - 1] + w; } // Try incremental decimation: /*if (count > 4) { float threshold = 1.0f / 4; for (uint i = 1; i < count; ++i) { if (sat->r[i] - sat->r[i - 1] < threshold && sat->g[i] - sat->g[i - 1] < threshold && sat->b[i] - sat->b[i - 1] < threshold) { for (int j = i+1; j < count; j++) { sat->r[j - 1] = sat->r[j]; sat->g[j - 1] = sat->g[j]; sat->b[j - 1] = sat->b[j]; sat->w[j - 1] = sat->w[j]; } count -= 1; i -= 1; if (count == 4) break; } } }*/ for (int i = count; i < 16; i++) { sat->r[i] = FLT_MAX; sat->g[i] = FLT_MAX; sat->b[i] = FLT_MAX; sat->w[i] = FLT_MAX; } return count; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Cluster Fit struct Combinations { uint8 c0, c1, c2, pad; }; static ICBC_ALIGN_16 int s_fourClusterTotal[16]; static ICBC_ALIGN_16 int s_threeClusterTotal[16]; static ICBC_ALIGN_16 Combinations s_fourCluster[968 + 8]; static ICBC_ALIGN_16 Combinations s_threeCluster[152 + 8]; #if ICBC_USE_NEON_VTL static uint8 s_neon_vtl_index0_4[4 * 968]; static uint8 s_neon_vtl_index1_4[4 * 968]; static uint8 s_neon_vtl_index2_4[4 * 968]; static uint8 s_neon_vtl_index0_3[4 * 152]; static uint8 s_neon_vtl_index1_3[4 * 152]; #endif static void init_cluster_tables() { for (int t = 1, i = 0; t <= 16; t++) { for (int c0 = 0; c0 <= t; c0++) { for (int c1 = 0; c1 <= t - c0; c1++) { for (int c2 = 0; c2 <= t - c0 - c1; c2++) { // Skip this cluster so that the total is a multiple of 8 if (c0 == 0 && c1 == 0 && c2 == 0) continue; bool found = false; if (t > 1) { for (int j = 0; j < s_fourClusterTotal[t-2]; j++) { if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) { found = true; break; } } } if (!found) { s_fourCluster[i].c0 = uint8(c0); s_fourCluster[i].c1 = uint8(c0+c1); s_fourCluster[i].c2 = uint8(c0+c1+c2); i++; } } } } s_fourClusterTotal[t - 1] = i; } // Replicate last entry. for (int i = 0; i < 8; i++) { s_fourCluster[968 + i] = s_fourCluster[968-1]; } for (int t = 1, i = 0; t <= 16; t++) { for (int c0 = 0; c0 <= t; c0++) { for (int c1 = 0; c1 <= t - c0; c1++) { // Skip this cluster so that the total is a multiple of 8 if (c0 == 0 && c1 == 0) continue; bool found = false; if (t > 1) { for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) { if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) { found = true; break; } } } if (!found) { s_threeCluster[i].c0 = uint8(c0); s_threeCluster[i].c1 = uint8(c0 + c1); i++; } } } s_threeClusterTotal[t - 1] = i; } // Replicate last entry. 
for (int i = 0; i < 8; i++) { s_threeCluster[152 + i] = s_threeCluster[152 - 1]; } #if ICBC_USE_NEON_VTL for (int i = 0; i < 968; i++) { int c0 = (s_fourCluster[i].c0) - 1; s_neon_vtl_index0_4[4 * i + 0] = uint8(c0 * 4 + 0); s_neon_vtl_index0_4[4 * i + 1] = uint8(c0 * 4 + 1); s_neon_vtl_index0_4[4 * i + 2] = uint8(c0 * 4 + 2); s_neon_vtl_index0_4[4 * i + 3] = uint8(c0 * 4 + 3); int c1 = (s_fourCluster[i].c1) - 1; s_neon_vtl_index1_4[4 * i + 0] = uint8(c1 * 4 + 0); s_neon_vtl_index1_4[4 * i + 1] = uint8(c1 * 4 + 1); s_neon_vtl_index1_4[4 * i + 2] = uint8(c1 * 4 + 2); s_neon_vtl_index1_4[4 * i + 3] = uint8(c1 * 4 + 3); int c2 = (s_fourCluster[i].c2) - 1; s_neon_vtl_index2_4[4 * i + 0] = uint8(c2 * 4 + 0); s_neon_vtl_index2_4[4 * i + 1] = uint8(c2 * 4 + 1); s_neon_vtl_index2_4[4 * i + 2] = uint8(c2 * 4 + 2); s_neon_vtl_index2_4[4 * i + 3] = uint8(c2 * 4 + 3); } for (int i = 0; i < 152; i++) { int c0 = (s_threeCluster[i].c0) - 1; s_neon_vtl_index0_3[4 * i + 0] = uint8(c0 * 4 + 0); s_neon_vtl_index0_3[4 * i + 1] = uint8(c0 * 4 + 1); s_neon_vtl_index0_3[4 * i + 2] = uint8(c0 * 4 + 2); s_neon_vtl_index0_3[4 * i + 3] = uint8(c0 * 4 + 3); int c1 = (s_threeCluster[i].c1) - 1; s_neon_vtl_index1_3[4 * i + 0] = uint8(c1 * 4 + 0); s_neon_vtl_index1_3[4 * i + 1] = uint8(c1 * 4 + 1); s_neon_vtl_index1_3[4 * i + 2] = uint8(c1 * 4 + 2); s_neon_vtl_index1_3[4 * i + 3] = uint8(c1 * 4 + 3); } #endif } static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) { const float r_sum = sat.r[count-1]; const float g_sum = sat.g[count-1]; const float b_sum = sat.b[count-1]; const float w_sum = sat.w[count-1]; VFloat vbesterror = vbroadcast(FLT_MAX); VVector3 vbeststart = { vzero(), vzero(), vzero() }; VVector3 vbestend = { vzero(), vzero(), vzero() }; // check all possible clusters for this total order const int total_order_count = s_threeClusterTotal[count - 1]; for (int i = 0; i < total_order_count; i += VEC_SIZE) { VVector3 x0, x1; VFloat w0, w1; #if ICBC_USE_AVX512_PERMUTE auto loadmask = lane_id() < vbroadcast(float(count)); // Load sat in one register: VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); VInt c0 = (packedClusterIndex & 0xFF) - 1; VInt c1 = (packedClusterIndex >> 8) - 1; x0.x = vpermuteif(c0 >= 0, vrsat, c0); x0.y = vpermuteif(c0 >= 0, vgsat, c0); x0.z = vpermuteif(c0 >= 0, vbsat, c0); w0 = vpermuteif(c0 >= 0, vwsat, c0); x1.x = vpermuteif(c1 >= 0, vrsat, c1); x1.y = vpermuteif(c1 >= 0, vgsat, c1); x1.z = vpermuteif(c1 >= 0, vbsat, c1); w1 = vpermuteif(c1 >= 0, vwsat, c1); #elif ICBC_USE_AVX2_PERMUTE2 // Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2. 
VInt packedClusterIndex = vload((int *)&s_threeCluster[i]); VInt c0 = (packedClusterIndex & 0xFF); VInt c1 = ((packedClusterIndex >> 8)); // No need for & 0xFF if (count <= 8) { // Load sat.r in one register: VFloat rLo = vload(sat.r); VFloat gLo = vload(sat.g); VFloat bLo = vload(sat.b); VFloat wLo = vload(sat.w); x0.x = vpermuteif(c0>0, rLo, c0-1); x0.y = vpermuteif(c0>0, gLo, c0-1); x0.z = vpermuteif(c0>0, bLo, c0-1); w0 = vpermuteif(c0>0, wLo, c0-1); x1.x = vpermuteif(c1>0, rLo, c1-1); x1.y = vpermuteif(c1>0, gLo, c1-1); x1.z = vpermuteif(c1>0, bLo, c1-1); w1 = vpermuteif(c1>0, wLo, c1-1); } else { // Load sat.r in two registers: VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); w0 = vpermute2if(c0>0, wLo, wHi, c0-1); x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); w1 = vpermute2if(c1>0, wLo, wHi, c1-1); } #elif ICBC_USE_NEON_VTL uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_3[4*i]; uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_3[4*i]; if (count <= 4) { uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); } else if (count <= 8) { uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); } else if (count <= 12) { uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); } else { // Load SAT. 
uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); } #else // Scalar path x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); for (int l = 0; l < VEC_SIZE; l++) { int c0 = s_threeCluster[i + l].c0 - 1; if (c0 >= 0) { lane(x0.x, l) = sat.r[c0]; lane(x0.y, l) = sat.g[c0]; lane(x0.z, l) = sat.b[c0]; lane(w0, l) = sat.w[c0]; } int c1 = s_threeCluster[i + l].c1 - 1; if (c1 >= 0) { lane(x1.x, l) = sat.r[c1]; lane(x1.y, l) = sat.g[c1]; lane(x1.z, l) = sat.b[c1]; lane(w1, l) = sat.w[c1]; } } #endif VFloat w2 = vbroadcast(w_sum) - w1; x1 = x1 - x0; w1 = w1 - w0; VFloat alphabeta_sum = w1 * vbroadcast(0.25f); VFloat alpha2_sum = w0 + alphabeta_sum; VFloat beta2_sum = w2 + alphabeta_sum; VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f); VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; // snap to the grid a = vround_ept(a); b = vround_ept(b); // compute the error VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); // apply the metric to the error term VFloat error = vdot(e1, vbroadcast(metric_sqr)); // keep the solution if it wins auto mask = (error < vbesterror); // I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX. //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? vbesterror = vselect(mask, vbesterror, error); vbeststart = vselect(mask, vbeststart, a); vbestend = vselect(mask, vbestend, b); } int bestindex = reduce_min_index(vbesterror); start->x = lane(vbeststart.x, bestindex); start->y = lane(vbeststart.y, bestindex); start->z = lane(vbeststart.z, bestindex); end->x = lane(vbestend.x, bestindex); end->y = lane(vbestend.y, bestindex); end->z = lane(vbestend.z, bestindex); } static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end) { const float r_sum = sat.r[count-1]; const float g_sum = sat.g[count-1]; const float b_sum = sat.b[count-1]; const float w_sum = sat.w[count-1]; VFloat vbesterror = vbroadcast(FLT_MAX); VVector3 vbeststart = { vzero(), vzero(), vzero() }; VVector3 vbestend = { vzero(), vzero(), vzero() }; // check all possible clusters for this total order const int total_order_count = s_fourClusterTotal[count - 1]; for (int i = 0; i < total_order_count; i += VEC_SIZE) { VVector3 x0, x1, x2; VFloat w0, w1, w2; /* // Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation. uint akku = 0; // Compute alpha & beta for this permutation. 
#pragma unroll for (int i = 0; i < 16; i++) { const uint bits = permutation >> (2*i); alphax_sum += alphaTable4[bits & 3] * colors[i]; akku += prods4[bits & 3]; } float alpha2_sum = float(akku >> 16); float beta2_sum = float((akku >> 8) & 0xff); float alphabeta_sum = float(akku & 0xff); float3 betax_sum = 9.0f * color_sum - alphax_sum; */ #if ICBC_USE_AVX512_PERMUTE auto loadmask = lane_id() < vbroadcast(float(count)); // Load sat in one register: VFloat vrsat = vload(loadmask, sat.r, FLT_MAX); VFloat vgsat = vload(loadmask, sat.g, FLT_MAX); VFloat vbsat = vload(loadmask, sat.b, FLT_MAX); VFloat vwsat = vload(loadmask, sat.w, FLT_MAX); // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); VInt c0 = (packedClusterIndex & 0xFF) - 1; VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1; VInt c2 = ((packedClusterIndex >> 16)) - 1; // @@ No need for & x0.x = vpermuteif(c0 >= 0, vrsat, c0); x0.y = vpermuteif(c0 >= 0, vgsat, c0); x0.z = vpermuteif(c0 >= 0, vbsat, c0); w0 = vpermuteif(c0 >= 0, vwsat, c0); x1.x = vpermuteif(c1 >= 0, vrsat, c1); x1.y = vpermuteif(c1 >= 0, vgsat, c1); x1.z = vpermuteif(c1 >= 0, vbsat, c1); w1 = vpermuteif(c1 >= 0, vwsat, c1); x2.x = vpermuteif(c2 >= 0, vrsat, c2); x2.y = vpermuteif(c2 >= 0, vgsat, c2); x2.z = vpermuteif(c2 >= 0, vbsat, c2); w2 = vpermuteif(c2 >= 0, vwsat, c2); #elif ICBC_USE_AVX2_PERMUTE2 // Load 4 uint8 per lane. VInt packedClusterIndex = vload((int *)&s_fourCluster[i]); VInt c0 = (packedClusterIndex & 0xFF); VInt c1 = ((packedClusterIndex >> 8) & 0xFF); VInt c2 = ((packedClusterIndex >> 16)); // @@ No need for & if (count <= 8) { // Load sat.r in one register: VFloat rLo = vload(sat.r); VFloat gLo = vload(sat.g); VFloat bLo = vload(sat.b); VFloat wLo = vload(sat.w); x0.x = vpermuteif(c0>0, rLo, c0-1); x0.y = vpermuteif(c0>0, gLo, c0-1); x0.z = vpermuteif(c0>0, bLo, c0-1); w0 = vpermuteif(c0>0, wLo, c0-1); x1.x = vpermuteif(c1>0, rLo, c1-1); x1.y = vpermuteif(c1>0, gLo, c1-1); x1.z = vpermuteif(c1>0, bLo, c1-1); w1 = vpermuteif(c1>0, wLo, c1-1); x2.x = vpermuteif(c2>0, rLo, c2-1); x2.y = vpermuteif(c2>0, gLo, c2-1); x2.z = vpermuteif(c2>0, bLo, c2-1); w2 = vpermuteif(c2>0, wLo, c2-1); } else { // Load sat.r in two registers: VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8); VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8); VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8); VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8); x0.x = vpermute2if(c0>0, rLo, rHi, c0-1); x0.y = vpermute2if(c0>0, gLo, gHi, c0-1); x0.z = vpermute2if(c0>0, bLo, bHi, c0-1); w0 = vpermute2if(c0>0, wLo, wHi, c0-1); x1.x = vpermute2if(c1>0, rLo, rHi, c1-1); x1.y = vpermute2if(c1>0, gLo, gHi, c1-1); x1.z = vpermute2if(c1>0, bLo, bHi, c1-1); w1 = vpermute2if(c1>0, wLo, wHi, c1-1); x2.x = vpermute2if(c2>0, rLo, rHi, c2-1); x2.y = vpermute2if(c2>0, gLo, gHi, c2-1); x2.z = vpermute2if(c2>0, bLo, bHi, c2-1); w2 = vpermute2if(c2>0, wLo, wHi, c2-1); } #elif ICBC_USE_NEON_VTL uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_4[4*i]; uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_4[4*i]; uint8x16_t idx2 = (uint8x16_t &)s_neon_vtl_index2_4[4*i]; if (count <= 4) { uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r); uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g); uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b); uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0)); w0 = 
vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1)); w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2)); w2 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2)); } else if (count <= 8) { uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r); uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g); uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b); uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0)); w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1)); w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2)); w2 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2)); } else if (count <= 12) { uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r); uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g); uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b); uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0)); w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1)); w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2)); w2 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2)); } else { uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r); uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g); uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b); uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w); x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0)); x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0)); x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0)); w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0)); x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1)); x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1)); x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1)); w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1)); x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2)); x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2)); x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2)); w2 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2)); } #else // Scalar path x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero(); x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero(); x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero(); for (int l = 0; l < VEC_SIZE; l++) { int c0 = s_fourCluster[i + l].c0 - 1; if (c0 >= 0) { lane(x0.x, l) = sat.r[c0]; lane(x0.y, l) = sat.g[c0]; lane(x0.z, l) = sat.b[c0]; lane(w0, l) = sat.w[c0]; } int c1 = s_fourCluster[i + l].c1 - 1; if (c1 >= 0) { lane(x1.x, l) = sat.r[c1]; lane(x1.y, l) = sat.g[c1]; lane(x1.z, l) = sat.b[c1]; 
lane(w1, l) = sat.w[c1]; } int c2 = s_fourCluster[i + l].c2 - 1; if (c2 >= 0) { lane(x2.x, l) = sat.r[c2]; lane(x2.y, l) = sat.g[c2]; lane(x2.z, l) = sat.b[c2]; lane(w2, l) = sat.w[c2]; } } #endif VFloat w3 = vbroadcast(w_sum) - w2; x2 = x2 - x1; x1 = x1 - x0; w2 = w2 - w1; w1 = w1 - w0; VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0)); VFloat beta2_sum = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3)); VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f); VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum)); VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0)); VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum; VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor; VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor; // snap to the grid a = vround_ept(a); b = vround_ept(b); // compute the error VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f); VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2)); // apply the metric to the error term VFloat error = vdot(e1, vbroadcast(metric_sqr)); // keep the solution if it wins auto mask = (error < vbesterror); // We could mask the unused lanes here, but instead set the invalid SAT entries to FLT_MAX. //mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds? vbesterror = vselect(mask, vbesterror, error); vbeststart = vselect(mask, vbeststart, a); vbestend = vselect(mask, vbestend, b); } int bestindex = reduce_min_index(vbesterror); start->x = lane(vbeststart.x, bestindex); start->y = lane(vbeststart.y, bestindex); start->z = lane(vbeststart.z, bestindex); end->x = lane(vbestend.x, bestindex); end->y = lane(vbestend.y, bestindex); end->z = lane(vbestend.z, bestindex); } /////////////////////////////////////////////////////////////////////////////////////////////////// // Palette evaluation. 
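// The D3D10, NVIDIA and AMD decoders below differ only in how they derive the two interpolated
// palette entries from the endpoints. As an illustrative worked example of the D3D10 rules that
// follow (not an additional code path): expanding c0 = (31, 63, 31) and c1 = (0, 0, 0) from 5:6:5
// gives palette[0] = (255, 255, 255) and palette[1] = (0, 0, 0), so four-color mode produces
// palette[2] = (2 * 255 + 0) / 3 = 170 and palette[3] = (2 * 0 + 255) / 3 = 85 in every channel,
// while three-color mode (c0.u <= c1.u) would produce palette[2] = (255 + 0) / 2 = 127 and a
// transparent black palette[3].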
Decoder s_decoder = Decoder_D3D10; // D3D10 inline void evaluate_palette4_d3d10(Color32 palette[4]) { palette[2].r = (2 * palette[0].r + palette[1].r) / 3; palette[2].g = (2 * palette[0].g + palette[1].g) / 3; palette[2].b = (2 * palette[0].b + palette[1].b) / 3; palette[2].a = 0xFF; palette[3].r = (2 * palette[1].r + palette[0].r) / 3; palette[3].g = (2 * palette[1].g + palette[0].g) / 3; palette[3].b = (2 * palette[1].b + palette[0].b) / 3; palette[3].a = 0xFF; } inline void evaluate_palette3_d3d10(Color32 palette[4]) { palette[2].r = (palette[0].r + palette[1].r) / 2; palette[2].g = (palette[0].g + palette[1].g) / 2; palette[2].b = (palette[0].b + palette[1].b) / 2; palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_d3d10(palette); } else { evaluate_palette3_d3d10(palette); } } // NV inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8); palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256); palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8); palette[2].a = 0xFF; palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8); palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256); palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8); palette[3].a = 0xFF; } inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) { int gdiff = palette[1].g - palette[0].g; palette[2].r = uint8(((c0.r + c1.r) * 33) / 8); palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256); palette[2].b = uint8(((c0.b + c1.b) * 33) / 8); palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_nv(c0, c1, palette); } else { evaluate_palette3_nv(c0, c1, palette); } } // AMD inline void evaluate_palette4_amd(Color32 palette[4]) { palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6); palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6); palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6); palette[2].a = 0xFF; palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6); palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6); palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6); palette[3].a = 0xFF; } inline void evaluate_palette3_amd(Color32 palette[4]) { palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2); palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2); palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2); palette[2].a = 0xFF; palette[3].u = 0; } static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) { palette[0] = bitexpand_color16_to_color32(c0); palette[1] = bitexpand_color16_to_color32(c1); if (c0.u > c1.u) { evaluate_palette4_amd(palette); } else { evaluate_palette3_amd(palette); } } inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) { if (decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette); else if (decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette); else if (decoder == Decoder_AMD) 
evaluate_palette_amd(c0, c1, palette); } static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { Color32 palette32[4]; evaluate_palette(c0, c1, palette32); for (int i = 0; i < 4; i++) { palette[i] = color_to_vector3(palette32[i]); } } static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder) { Color32 palette[4]; evaluate_palette(block->col0, block->col1, palette, decoder); for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; Color32 c = palette[index]; rgba_block[4 * i + 0] = c.r; rgba_block[4 * i + 1] = c.g; rgba_block[4 * i + 2] = c.b; rgba_block[4 * i + 3] = c.a; } } /////////////////////////////////////////////////////////////////////////////////////////////////// // Error evaluation. // Different ways of estimating the error. static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { Vector3 d = (p - c) * w * 255; return dot(d, d); } static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) { Vector3 d = (color_to_vector3(p) - c) * w * 255; return dot(d, d); } static int evaluate_mse(const Color32 & p, const Color32 & c) { return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); } static int evaluate_mse(const Color32 palette[4], const Color32 & c) { int e0 = evaluate_mse(palette[0], c); int e1 = evaluate_mse(palette[1], c); int e2 = evaluate_mse(palette[2], c); int e3 = evaluate_mse(palette[3], c); return min(min(e0, e1), min(e2, e3)); } // Returns MSE error in [0-255] range. static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { Color32 palette[4]; evaluate_palette(output->col0, output->col1, palette); return evaluate_mse(palette[index], color); } // Returns weighted MSE error in [0-255] range. static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { float total = 0.0f; for (int i = 0; i < count; i++) { total += weights[i] * evaluate_mse(palette, colors[i]); } return total; } static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) { float total = 0.0f; for (int i = 0; i < count; i++) { total += evaluate_mse(palette, colors[i]); } return total; } static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { Color32 palette[4]; evaluate_palette(output->col0, output->col1, palette); // evaluate error for each index. float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (output->indices >> (2 * i)) & 3; error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); } return error; } static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) { // evaluate error for each index. float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (indices >> (2 * i)) & 3; error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights); } return error; } float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) { Color32 palette[4]; evaluate_palette(block->col0, block->col1, palette, decoder); // evaluate error for each index. 
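    // Each of the 16 texels consumes two bits of block->indices (texel 0 in the least significant
    // bits), and each 2-bit value selects one of the four palette entries computed above.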
float error = 0.0f; for (int i = 0; i < 16; i++) { int index = (block->indices >> (2 * i)) & 3; Color32 c; c.r = rgba_block[4 * i + 0]; c.g = rgba_block[4 * i + 1]; c.b = rgba_block[4 * i + 2]; c.a = 255; error += evaluate_mse(palette[index], c); } return error; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Index selection // @@ Can we interleave the two uint16 at once? inline uint32 interleave_uint16_with_zeros(uint32 input) { uint32 word = input; word = (word ^ (word << 8 )) & 0x00ff00ff; word = (word ^ (word << 4 )) & 0x0f0f0f0f; word = (word ^ (word << 2 )) & 0x33333333; word = (word ^ (word << 1 )) & 0x55555555; return word; } // Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/ ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) { #if ICBC_BMI2 return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa); #else return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1); #endif } static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { uint indices0 = 0; uint indices1 = 0; VVector3 vw = vbroadcast(color_weights); VVector3 vp0 = vbroadcast(palette[0]) * vw; VVector3 vp1 = vbroadcast(palette[1]) * vw; VVector3 vp2 = vbroadcast(palette[2]) * vw; VVector3 vp3 = vbroadcast(palette[3]) * vw; for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vc - vp0); VFloat d1 = vlen2(vc - vp1); VFloat d2 = vlen2(vc - vp2); VFloat d3 = vlen2(vc - vp3); VMask b1 = d1 > d2; VMask b2 = d0 > d2; VMask x0 = b1 & b2; VMask b0 = d0 > d3; VMask b3 = d1 > d3; x0 = x0 | (b0 & b3); VMask b4 = d2 > d3; VMask x1 = b0 & b4; indices0 |= mask(x0) << i; indices1 |= mask(x1) << i; } return interleave(indices1, indices0); } static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) { uint indices0 = 0; uint indices1 = 0; VVector3 vw = vbroadcast(color_weights); VVector3 vp0 = vbroadcast(palette[0]) * vw; VVector3 vp1 = vbroadcast(palette[1]) * vw; VVector3 vp2 = vbroadcast(palette[2]) * vw; if (allow_transparent_black) { for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vp0 - vc); VFloat d1 = vlen2(vp1 - vc); VFloat d2 = vlen2(vp2 - vc); VFloat d3 = vdot(vc, vc); VMask i1 = (d1 < d2); VMask i2 = (d2 <= d0) & (d2 <= d1); VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2); indices0 |= mask(i2 | i3) << i; indices1 |= mask(i1 | i3) << i; } } else { for (int i = 0; i < 16; i += VEC_SIZE) { VVector3 vc = vload(&input_colors[i]) * vw; VFloat d0 = vlen2(vc - vp0); VFloat d1 = vlen2(vc - vp1); VFloat d2 = vlen2(vc - vp2); VMask i1 = (d1 < d2); VMask i2 = (d2 <= d0) & (d2 <= d1); indices0 |= mask(i2) << i; indices1 |= mask(i1) << i; } } return interleave(indices1, indices0); } static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { #if 0 Vector3 p0 = palette[0] * color_weights; Vector3 p1 = palette[1] * color_weights; Vector3 p2 = palette[2] * color_weights; Vector3 p3 = palette[3] * color_weights; uint indices = 0; for (int i = 0; i < 16; i++) { Vector3 ci = input_colors[i].xyz * color_weights; float d0 = lengthSquared(p0 - ci); float d1 = lengthSquared(p1 - ci); float d2 = lengthSquared(p2 - ci); float d3 = lengthSquared(p3 - ci); uint index; if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; 
else if (d1 < d2 && d1 < d3) index = 1;
        else if (d2 < d3) index = 2;
        else index = 3;

        indices |= index << (2 * i);
    }

    return indices;
#else
    uint indices0 = 0;
    uint indices1 = 0;

    VVector3 vw = vbroadcast(color_weights);
    VVector3 vp0 = vbroadcast(palette[0]) * vw;
    VVector3 vp1 = vbroadcast(palette[1]) * vw;
    VVector3 vp2 = vbroadcast(palette[2]) * vw;
    VVector3 vp3 = vbroadcast(palette[3]) * vw;

    for (int i = 0; i < 16; i += VEC_SIZE) {
        VVector3 vc = vload(&input_colors[i]) * vw;

        VFloat d0 = vlen2(vc - vp0);
        VFloat d1 = vlen2(vc - vp1);
        VFloat d2 = vlen2(vc - vp2);
        VFloat d3 = vlen2(vc - vp3);

        //VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3);
        VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3);
        VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3);
        VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);

        //VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0));
        indices0 |= mask(i2 | i3) << i;
        indices1 |= mask(i1 | i3) << i;
    }

    uint indices = interleave(indices1, indices0);
    return indices;
#endif
}

static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
    Color16 color0 = vector3_to_color16(v0);
    Color16 color1 = vector3_to_color16(v1);

    if (color0.u > color1.u) {
        swap(color0, color1);
    }

    Vector3 palette[4];
    evaluate_palette(color0, color1, palette);

    block->col0 = color0;
    block->col1 = color1;
    block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette);

    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}

static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
    Color16 color0 = vector3_to_color16(v0);
    Color16 color1 = vector3_to_color16(v1);

    if (color0.u < color1.u) {
        swap(color0, color1);
    }

    Vector3 palette[4];
    evaluate_palette(color0, color1, palette);

    block->col0 = color0;
    block->col1 = color1;
    block->indices = compute_indices4(input_colors, color_weights, palette);

    return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}

// Least squares fitting of color end points for the given indices. @@ Take weights into account.
static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
{
    float alpha2_sum = 0.0f;
    float beta2_sum = 0.0f;
    float alphabeta_sum = 0.0f;
    Vector3 alphax_sum = { 0,0,0 };
    Vector3 betax_sum = { 0,0,0 };

    for (int i = 0; i < count; i++) {
        const uint bits = indices >> (2 * i);

        float beta = float(bits & 1);
        if (bits & 2) beta = (1 + beta) / 3.0f;
        float alpha = 1.0f - beta;

        alpha2_sum += alpha * alpha;
        beta2_sum += beta * beta;
        alphabeta_sum += alpha * beta;
        alphax_sum += alpha * colors[i].xyz;
        betax_sum += beta * colors[i].xyz;
    }

    float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
    if (equal(denom, 0.0f)) return false;

    float factor = 1.0f / denom;
    *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
    *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
    return true;
}

// Least squares optimization with custom factors.
// This allows us to pass the standard [1, 0, 2/3, 1/3] weights by default, but also to use alternative mappings when the number of clusters is not 4.
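// For example, the default overload below passes { 1, 0, 2/3, 1/3 }, so index k contributes
// alpha = factors[k] and beta = 1 - factors[k] to the normal equations. A hypothetical 3-color
// mapping could instead pass { 1, 0, 0.5f, 0.5f }, mirroring the weights that optimize_end_points3
// uses further down.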
static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b) { float alpha2_sum = 0.0f; float beta2_sum = 0.0f; float alphabeta_sum = 0.0f; Vector3 alphax_sum = { 0,0,0 }; Vector3 betax_sum = { 0,0,0 }; for (int i = 0; i < count; i++) { const uint idx = (indices >> (2 * i)) & 3; float alpha = factors[idx]; float beta = 1 - alpha; alpha2_sum += alpha * alpha; beta2_sum += beta * beta; alphabeta_sum += alpha * beta; alphax_sum += alpha * colors[i]; betax_sum += beta * colors[i]; } float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; if (equal(denom, 0.0f)) return false; float factor = 1.0f / denom; *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); return true; } static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b) { float factors[4] = { 1, 0, 2.f / 3, 1.f / 3 }; return optimize_end_points4(indices, colors, count, factors, a, b); } // Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account. static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b) { float alpha2_sum = 0.0f; float beta2_sum = 0.0f; float alphabeta_sum = 0.0f; Vector3 alphax_sum = { 0,0,0 }; Vector3 betax_sum = { 0,0,0 }; for (int i = 0; i < count; i++) { const uint bits = indices >> (2 * i); float beta = float(bits & 1); if (bits & 2) beta = 0.5f; float alpha = 1.0f - beta; alpha2_sum += alpha * alpha; beta2_sum += beta * beta; alphabeta_sum += alpha * beta; alphax_sum += alpha * colors[i]; betax_sum += beta * colors[i]; } float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; if (equal(denom, 0.0f)) return false; float factor = 1.0f / denom; *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); return true; } // find minimum and maximum colors based on bounding box in color space inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) { *c0 = { 0,0,0 }; *c1 = { 1,1,1 }; for (int i = 0; i < count; i++) { *c0 = max(*c0, colors[i]); *c1 = min(*c1, colors[i]); } } inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1) { Vector3 center = (*c0 + *c1) * 0.5f; /*Vector3 center = colors[0]; for (int i = 1; i < count; i++) { center = center * float(i-1) / i + colors[i] / i; }*/ /*Vector3 center = colors[0]; for (int i = 1; i < count; i++) { center += colors[i]; } center /= count;*/ float cov_xz = 0.0f; float cov_yz = 0.0f; for (int i = 0; i < count; i++) { Vector3 t = colors[i] - center; cov_xz += t.x * t.z; cov_yz += t.y * t.z; } float x0 = c0->x; float y0 = c0->y; float x1 = c1->x; float y1 = c1->y; if (cov_xz < 0) { swap(x0, x1); } if (cov_yz < 0) { swap(y0, y1); } *c0 = { x0, y0, c0->z }; *c1 = { x1, y1, c1->z }; } inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1) { float bias = (8.0f / 255.0f) / 16.0f; Vector3 inset = (*c0 - *c1) / 16.0f - scalar_to_vector3(bias); *c0 = saturate(*c0 - inset); *c1 = saturate(*c1 + inset); } // Single color lookup tables from: // https://github.com/nothings/stb/blob/master/stb_dxt.h static uint8 s_match5[256][2]; static uint8 
s_match6[256][2]; static inline int Lerp13(int a, int b) { // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. return (a * 2 + b) / 3; } static void PrepareOptTable5(uint8 * table, Decoder decoder) { uint8 expand[32]; for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; for (int mn = 0; mn < 32; mn++) { for (int mx = 0; mx < 32; mx++) { int mine = expand[mn]; int maxe = expand[mx]; int err; int amd_r = (43 * maxe + 21 * mine + 32) >> 6; int amd_err = abs(amd_r - i); int nv_r = ((2 * mx + mn) * 22) / 8; int nv_err = abs(nv_r - i); if (decoder == Decoder_D3D10) { // DX10 spec says that interpolation must be within 3% of "correct" result, // add this as error term. (normally we'd expect a random distribution of // +-1.5% error, but nowhere in the spec does it say that the error has to be // unbiased - better safe than sorry). int r = (maxe * 2 + mine) / 3; err = abs(r - i) * 100 + abs(mx - mn) * 3; // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } else if (decoder == Decoder_NVIDIA) { err = nv_err; } else /*if (decoder == Decoder_AMD)*/ { err = amd_err; } if (err < bestErr) { bestErr = err; table[i * 2 + 0] = uint8(mx); table[i * 2 + 1] = uint8(mn); } } } } } static void PrepareOptTable6(uint8 * table, Decoder decoder) { uint8 expand[64]; for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4)); for (int i = 0; i < 256; i++) { int bestErr = 256 * 100; for (int mn = 0; mn < 64; mn++) { for (int mx = 0; mx < 64; mx++) { int mine = expand[mn]; int maxe = expand[mx]; int err; int amd_g = (43 * maxe + 21 * mine + 32) >> 6; int amd_err = abs(amd_g - i); int nv_g = (256 * mine + (maxe - mine) / 4 + 128 + (maxe - mine) * 80) / 256; int nv_err = abs(nv_g - i); if (decoder == Decoder_D3D10) { // DX10 spec says that interpolation must be within 3% of "correct" result, // add this as error term. (normally we'd expect a random distribution of // +-1.5% error, but nowhere in the spec does it say that the error has to be // unbiased - better safe than sorry). int g = (maxe * 2 + mine) / 3; err = abs(g - i) * 100 + abs(mx - mn) * 3; // Another approach is to consider the worst of AMD and NVIDIA errors. err = max(amd_err, nv_err); } else if (decoder == Decoder_NVIDIA) { err = nv_err; } else /*if (decoder == Decoder_AMD)*/ { err = amd_err; } if (err < bestErr) { bestErr = err; table[i * 2 + 0] = uint8(mx); table[i * 2 + 1] = uint8(mn); } } } } } static void init_single_color_tables(Decoder decoder) { // Prepare single color lookup tables. 
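    // For every 8-bit channel value i, these tables store the (max, min) endpoint pair whose index-2
    // interpolant best reproduces i under the selected decoder; compress_dxt1_single_color_optimal
    // below then encodes a solid-color block with one table lookup per channel and a single repeated index.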
PrepareOptTable5(&s_match5[0][0], decoder); PrepareOptTable6(&s_match6[0][0], decoder); } // Single color compressor, based on: // https://mollyrocket.com/forums/viewtopic.php?t=392 static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) { output->col0.r = s_match5[c.r][0]; output->col0.g = s_match6[c.g][0]; output->col0.b = s_match5[c.b][0]; output->col1.r = s_match5[c.r][1]; output->col1.g = s_match6[c.g][1]; output->col1.b = s_match5[c.b][1]; output->indices = 0xaaaaaaaa; if (output->col0.u < output->col1.u) { swap(output->col0.u, output->col1.u); output->indices ^= 0x55555555; } } static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockDXT1 * output) { Vector3 metric_sqr = color_weights * color_weights; SummedAreaTable sat; int sat_count = compute_sat(colors, weights, count, &sat); Vector3 start, end; cluster_fit_four(sat, sat_count, metric_sqr, &start, &end); float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output); if (three_color_mode) { if (try_transparent_black) { Vector3 tmp_colors[16]; float tmp_weights[16]; int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights); if (!tmp_count) return best_error; sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat); } cluster_fit_three(sat, sat_count, metric_sqr, &start, &end); BlockDXT1 three_color_block; float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block); if (three_color_error < best_error) { best_error = three_color_error; *output = three_color_block; } } return best_error; } static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) { // TODO: // - Optimize palette evaluation when updating only one channel. // - try all diagonals. // Things that don't help: // - Alternate endpoint updates. // - Randomize order. // - If one direction does not improve, test opposite direction next. static const int8 deltas[16][3] = { {1,0,0}, {0,1,0}, {0,0,1}, {-1,0,0}, {0,-1,0}, {0,0,-1}, {1,1,0}, {1,0,1}, {0,1,1}, {-1,-1,0}, {-1,0,-1}, {0,-1,-1}, {-1,1,0}, //{-1,0,1}, {1,-1,0}, {0,-1,1}, //{1,0,-1}, {0,1,-1}, }; float best_error = input_error; int lastImprovement = 0; for (int i = 0; i < 256; i++) { BlockDXT1 refined = *output; int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] }; if ((i / 16) & 1) { refined.col0.r += delta[0]; refined.col0.g += delta[1]; refined.col0.b += delta[2]; } else { refined.col1.r += delta[0]; refined.col1.g += delta[1]; refined.col1.b += delta[2]; } if (!three_color_mode) { if (refined.col0.u == refined.col1.u) refined.col1.g += 1; if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u); } Vector3 palette[4]; evaluate_palette(output->col0, output->col1, palette); refined.indices = compute_indices(input_colors, color_weights, palette); float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined); if (refined_error < best_error) { best_error = refined_error; *output = refined; lastImprovement = i; } // Early out if the last 32 steps didn't improve error. 
if (i - lastImprovement > 32) break; } return best_error; } struct Options { float threshold = 0.0f; bool box_fit = false; bool least_squares_fit = false; bool cluster_fit = false; bool cluster_fit_3 = false; bool cluster_fit_3_black_only = false; bool endpoint_refinement = false; }; static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) { Options opt; switch (level) { case Quality_Level1: // Box fit + least squares fit. opt.box_fit = true; opt.least_squares_fit = true; opt.threshold = 1.0f / 256; break; case Quality_Level2: // Cluster fit 4, threshold = 24. opt.box_fit = true; opt.least_squares_fit = true; opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 24; break; case Quality_Level3: // Cluster fit 4, threshold = 32. opt.box_fit = true; opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 32; break; case Quality_Level4: // Cluster fit 3+4, threshold = 48. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 48; break; case Quality_Level5: // Cluster fit 3+4, threshold = 64. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 64; break; case Quality_Level6: // Cluster fit 3+4, threshold = 96. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 96; break; case Quality_Level7: // Cluster fit 3+4, threshold = 128. opt.cluster_fit = true; opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black; opt.threshold = 1.0f / 128; break; case Quality_Level8: // Cluster fit 3+4, threshold = 256. opt.cluster_fit = true; opt.cluster_fit_3 = enable_three_color_mode; opt.threshold = 1.0f / 256; break; case Quality_Level9: // Cluster fit 3+4, threshold = 256 + Refinement. opt.cluster_fit = true; opt.cluster_fit_3 = enable_three_color_mode; opt.threshold = 1.0f / 256; opt.endpoint_refinement = true; break; } return opt; } static float compress_dxt1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockDXT1 * output) { Options opt = setup_options(level, three_color_mode, three_color_black); Vector3 colors[16]; float weights[16]; bool any_black = false; int count; if (opt.cluster_fit) { count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black); } else { for (int i = 0; i < 16; i++) { colors[i] = input_colors[i].xyz; } count = 16; } if (count == 0) { // Output trivial block. output->col0.u = 0; output->col1.u = 0; output->indices = 0; return 0; } // Cluster fit cannot handle single color blocks, so encode them optimally. if (count == 1) { compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output); return evaluate_mse(input_colors, input_weights, color_weights, output); } float error = FLT_MAX; // Quick end point selection. if (opt.box_fit) { Vector3 c0, c1; fit_colors_bbox(colors, count, &c0, &c1); inset_bbox(&c0, &c1); select_diagonal(colors, count, &c0, &c1); error = output_block4(input_colors, input_weights, color_weights, c0, c1, output); // Refine color for the selected indices. 
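        // That is, keep the indices just chosen for the box-fit endpoints and solve the least squares
        // system in optimize_end_points4 for new endpoints; the refit block is only kept if it lowers the error.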
if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
            BlockDXT1 optimized_block;
            float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block);

            if (optimized_error < error) {
                error = optimized_error;
                *output = optimized_block;
            }
        }
    }

    if (opt.cluster_fit) {
        // @@ Use current endpoints as input for initial PCA approximation?

        bool use_three_color_black = any_black && three_color_black;
        bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only);

        // Try cluster fit.
        BlockDXT1 cluster_fit_output;
        float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output);

        if (cluster_fit_error < error) {
            *output = cluster_fit_output;
            error = cluster_fit_error;
        }
    }

    if (opt.endpoint_refinement) {
        error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output);
    }

    return error;
}

// Public API

void init_dxt1(Decoder decoder) {
    s_decoder = decoder;
    init_single_color_tables(decoder);
    init_cluster_tables();
}

void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
    decode_dxt1((const BlockDXT1 *)block, rgba_block, decoder);
}

float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) {
    return evaluate_dxt1_error(rgba_block, (const BlockDXT1 *)dxt_block, decoder);
}

float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float rgb[3], bool three_color_mode, bool three_color_black, void * output) {
    return compress_dxt1(level, (Vector4*)input_colors, input_weights, { rgb[0], rgb[1], rgb[2] }, three_color_mode, three_color_black, (BlockDXT1*)output);
}

} // icbc

//
// Do not pollute preprocessor definitions.
// #undef ICBC_SIMD
// #undef ICBC_ASSERT
// #undef ICBC_SCALAR
// #undef ICBC_SSE2
// #undef ICBC_SSE41
// #undef ICBC_AVX1
// #undef ICBC_AVX2
// #undef ICBC_AVX512
// #undef ICBC_NEON
// #undef ICBC_VMX
// #undef ICBC_USE_FMA
// #undef ICBC_USE_AVX2_PERMUTE2
// #undef ICBC_USE_AVX512_PERMUTE
// #undef ICBC_USE_NEON_VTL
// #undef ICBC_PERFECT_ROUND

#endif // ICBC_IMPLEMENTATION

// Version History:
// v1.00 - Initial release.
// v1.01 - Added SPMD code path with AVX support.
// v1.02 - Removed SIMD code path.
// v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection.
// v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations.
// v1.05 - Bug fixes. Small optimizations.

// Copyright (c) 2020 Ignacio Castano
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
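// Illustrative usage sketch. The buffer names below are placeholders; input colors are 16 RGBA
// floats per block in [0, 1] and the output is one 8-byte BC1 block, matching the public API above:
//
//   #define ICBC_IMPLEMENTATION
//   #include "icbc.h"
//
//   icbc::init_dxt1(icbc::Decoder_D3D10);
//
//   float input_colors[16 * 4];   // one 4x4 block, RGBA in [0, 1]
//   float input_weights[16];      // per-texel weights, e.g. all 1.0f
//   float color_weights[3] = { 1.0f, 1.0f, 1.0f };
//   unsigned char block[8];       // output BC1/DXT1 block
//
//   float mse = icbc::compress_dxt1(icbc::Quality_Default, input_colors, input_weights,
//                                   color_weights, /*three_color_mode=*/true,
//                                   /*three_color_black=*/true, block);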