// icbc.h v1.05
// A High Quality SIMD BC1 Encoder by Ignacio Castano <castano@gmail.com>.
//
// LICENSE:
// MIT license at the end of this file.
#ifndef ICBC_H
#define ICBC_H
namespace icbc {
enum Decoder {
Decoder_D3D10 = 0,
Decoder_NVIDIA = 1,
Decoder_AMD = 2
};
void init_dxt1(Decoder decoder = Decoder_D3D10);
enum Quality {
Quality_Level1, // Box fit + least squares fit.
Quality_Level2, // Cluster fit 4, threshold = 24.
Quality_Level3, // Cluster fit 4, threshold = 32.
Quality_Level4, // Cluster fit 4, threshold = 48.
Quality_Level5, // Cluster fit 4, threshold = 64.
Quality_Level6, // Cluster fit 4, threshold = 96.
Quality_Level7, // Cluster fit 4, threshold = 128.
Quality_Level8, // Cluster fit 4+3, threshold = 256.
Quality_Level9, // Cluster fit 4+3, threshold = 256 + Refinement.
Quality_Fast = Quality_Level1,
Quality_Default = Quality_Level8,
Quality_Max = Quality_Level9,
};
void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder = Decoder_D3D10);
float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * block, Decoder decoder = Decoder_D3D10);
float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output);
}
#endif // ICBC_H
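// A minimal usage sketch (illustrative, not part of the library). It assumes
// `input_colors` holds a 4x4 block as 16 RGBA texels of floats in [0, 1],
// matching the rgba_block layout used by decode_dxt1:
//
//   #define ICBC_IMPLEMENTATION
//   #include "icbc.h"
//
//   void compress_block(const float input_colors[16 * 4], void * output /* 8-byte BC1 block */) {
//       icbc::init_dxt1();                                   // Build internal tables once at startup.
//       float input_weights[16];
//       for (int i = 0; i < 16; i++) input_weights[i] = 1.0f;
//       const float color_weights[3] = { 1.0f, 1.0f, 1.0f }; // Uniform channel importance.
//       icbc::compress_dxt1(icbc::Quality_Default, input_colors, input_weights,
//                           color_weights, /*three_color_mode=*/true,
//                           /*three_color_black=*/true, output);
//   }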
#ifdef ICBC_IMPLEMENTATION
// Instruction-level support must be chosen at compile time by setting ICBC_SIMD to one of these values:
#define ICBC_SCALAR 0
#define ICBC_SSE2 1
#define ICBC_SSE41 2
#define ICBC_AVX1 3
#define ICBC_AVX2 4
#define ICBC_AVX512 5
#define ICBC_NEON -1
#define ICBC_VMX -2
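// For example, to force the AVX2 path, define ICBC_SIMD before including this file.
// (ICBC_AVX2 is not defined yet at that point, but the macro is only expanded by the
// #if checks below, after the values are defined, so either the name or the value 4 works.)
//
//   #define ICBC_SIMD ICBC_AVX2
//   #define ICBC_IMPLEMENTATION
//   #include "icbc.h"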
#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
#define ICBC_X86 1
#endif
#if (defined(__arm__) || defined(_M_ARM))
#define ICBC_ARM 1
#endif
#if (defined(__PPC__) || defined(_M_PPC))
#define ICBC_PPC 1
#endif
// SIMD version.
#ifndef ICBC_SIMD
#if ICBC_X86
#if __AVX512F__
#define ICBC_SIMD ICBC_AVX512
#elif __AVX2__
#define ICBC_SIMD ICBC_AVX2
#elif __AVX__
#define ICBC_SIMD ICBC_AVX1
#elif __SSE4_1__
#define ICBC_SIMD ICBC_SSE41
#elif __SSE2__
#define ICBC_SIMD ICBC_SSE2
#else
#define ICBC_SIMD ICBC_SCALAR
#endif
#endif
#if ICBC_ARM
#if __ARM_NEON__
#define ICBC_SIMD ICBC_NEON
#else
#define ICBC_SIMD ICBC_SCALAR
#endif
#endif
#if ICBC_PPC
#define ICBC_SIMD ICBC_VMX
#endif
#endif
// AVX1 does not require FMA, and depending on whether it's Intel or AMD you may have FMA3 or FMA4. What a mess.
#ifndef ICBC_USE_FMA
//#define ICBC_USE_FMA 3
//#define ICBC_USE_FMA 4
#endif
#if ICBC_SIMD >= ICBC_AVX2
#define ICBC_BMI2 1
#endif
// Apparently rcp is not deterministic (different precision on Intel and AMD); enable this for a small performance boost if you don't care about that.
//#define ICBC_USE_RCP 1
#if ICBC_SIMD == ICBC_AVX2
#define ICBC_USE_AVX2_PERMUTE2 1 // Using permutevar8x32 and bitops.
#endif
#if ICBC_SIMD == ICBC_AVX512
#define ICBC_USE_AVX512_PERMUTE 1
#endif
#if ICBC_SIMD == ICBC_NEON
#define ICBC_USE_NEON_VTL 0 // Not tested.
#endif
// Some experimental knobs:
#define ICBC_PERFECT_ROUND 0 // Enable perfect rounding to compute cluster fit residual.
#if ICBC_SIMD >= ICBC_SSE2
#include <emmintrin.h>
#endif
#if ICBC_SIMD >= ICBC_SSE41
#include <smmintrin.h>
#endif
#if ICBC_SIMD >= ICBC_AVX1
#include <immintrin.h>
#endif
#if ICBC_SIMD >= ICBC_AVX512 && _MSC_VER
#include <zmmintrin.h>
#endif
#if ICBC_SIMD == ICBC_NEON
#include <arm_neon.h>
#endif
#if ICBC_SIMD == ICBC_VMX
#include <altivec.h>
#endif
#if _MSC_VER
#include <intrin.h> // _BitScanForward
#endif
#include <stdint.h>
#include <stdlib.h> // abs
#include <string.h> // memset
#include <math.h> // fabsf
#include <float.h> // FLT_MAX
#ifndef ICBC_ASSERT
#if _DEBUG
#include <assert.h>
#define ICBC_ASSERT assert
#else
#define ICBC_ASSERT(x)
#endif
#endif
namespace icbc {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Templates
template <typename T> inline void swap(T & a, T & b) {
T temp(a);
a = b;
b = temp;
}
template <typename T> inline T max(const T & a, const T & b) {
return (b < a) ? a : b;
}
template <typename T> inline T min(const T & a, const T & b) {
return (a < b) ? a : b;
}
template <typename T> inline T clamp(const T & x, const T & a, const T & b) {
return min(max(x, a), b);
}
template <typename T> inline T square(const T & a) {
return a * a;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Basic Types
typedef uint8_t uint8;
typedef int8_t int8;
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint32_t uint;
struct Color16 {
union {
struct {
uint16 b : 5;
uint16 g : 6;
uint16 r : 5;
};
uint16 u;
};
};
struct Color32 {
union {
struct {
uint8 b, g, r, a;
};
uint32 u;
};
};
struct BlockDXT1 {
Color16 col0;
Color16 col1;
uint32 indices;
};
struct Vector3 {
float x;
float y;
float z;
inline void operator+=(Vector3 v) {
x += v.x; y += v.y; z += v.z;
}
inline void operator*=(Vector3 v) {
x *= v.x; y *= v.y; z *= v.z;
}
inline void operator*=(float s) {
x *= s; y *= s; z *= s;
}
};
struct Vector4 {
union {
struct {
float x, y, z, w;
};
Vector3 xyz;
};
};
inline Vector3 operator*(Vector3 v, float s) {
return { v.x * s, v.y * s, v.z * s };
}
inline Vector3 operator*(float s, Vector3 v) {
return { v.x * s, v.y * s, v.z * s };
}
inline Vector3 operator*(Vector3 a, Vector3 b) {
return { a.x * b.x, a.y * b.y, a.z * b.z };
}
inline float dot(Vector3 a, Vector3 b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
}
inline Vector3 operator+(Vector3 a, Vector3 b) {
return { a.x + b.x, a.y + b.y, a.z + b.z };
}
inline Vector3 operator-(Vector3 a, Vector3 b) {
return { a.x - b.x, a.y - b.y, a.z - b.z };
}
inline Vector3 operator/(Vector3 v, float s) {
return { v.x / s, v.y / s, v.z / s };
}
inline float saturate(float x) {
return clamp(x, 0.0f, 1.0f);
}
inline Vector3 saturate(Vector3 v) {
return { saturate(v.x), saturate(v.y), saturate(v.z) };
}
inline Vector3 min(Vector3 a, Vector3 b) {
return { min(a.x, b.x), min(a.y, b.y), min(a.z, b.z) };
}
inline Vector3 max(Vector3 a, Vector3 b) {
return { max(a.x, b.x), max(a.y, b.y), max(a.z, b.z) };
}
inline bool operator==(const Vector3 & a, const Vector3 & b) {
return a.x == b.x && a.y == b.y && a.z == b.z;
}
inline Vector3 scalar_to_vector3(float f) {
return {f, f, f};
}
inline float lengthSquared(Vector3 v) {
return dot(v, v);
}
inline bool equal(float a, float b, float epsilon = 0.0001) {
// http://realtimecollisiondetection.net/blog/?p=89
//return fabsf(a - b) < epsilon * max(1.0f, max(fabsf(a), fabsf(b)));
return fabsf(a - b) < epsilon;
}
inline bool equal(Vector3 a, Vector3 b, float epsilon) {
return equal(a.x, b.x, epsilon) && equal(a.y, b.y, epsilon) && equal(a.z, b.z, epsilon);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// SIMD
#ifndef ICBC_ALIGN_16
#if __GNUC__
# define ICBC_ALIGN_16 __attribute__ ((__aligned__ (16)))
#else // _MSC_VER
# define ICBC_ALIGN_16 __declspec(align(16))
#endif
#endif
#if __GNUC__
#define ICBC_FORCEINLINE inline __attribute__((always_inline))
#else
#define ICBC_FORCEINLINE __forceinline
#endif
// Count trailing zeros (BSF).
ICBC_FORCEINLINE int ctz(uint mask) {
#if __GNUC__
return __builtin_ctz(mask);
#else
unsigned long index;
_BitScanForward(&index, mask); // BSF: index of the lowest set bit, matching __builtin_ctz.
return (int)index;
#endif
}
#if ICBC_SIMD == ICBC_SCALAR // Purely scalar version.
constexpr int VEC_SIZE = 1;
using VFloat = float;
using VMask = bool;
ICBC_FORCEINLINE float & lane(VFloat & v, int i) { return v; }
ICBC_FORCEINLINE VFloat vzero() { return 0.0f; }
ICBC_FORCEINLINE VFloat vbroadcast(float x) { return x; }
ICBC_FORCEINLINE VFloat vload(const float * ptr) { return *ptr; }
ICBC_FORCEINLINE VFloat vrcp(VFloat a) { return 1.0f / a; }
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) { return a * b + c; }
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) { return a * b - c; }
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) { return a * b - c * d; }
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) { return min(max(a, 0.0f), 1.0f); }
ICBC_FORCEINLINE VFloat vround01(VFloat a) { return float(int(a + 0.5f)); }
ICBC_FORCEINLINE VFloat lane_id() { return 0; }
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) { return mask ? b : a; }
ICBC_FORCEINLINE VMask vbroadcast(bool b) { return b; }
ICBC_FORCEINLINE bool all(VMask m) { return m; }
ICBC_FORCEINLINE bool any(VMask m) { return m; }
ICBC_FORCEINLINE uint mask(VMask m) { return (uint)m; }
ICBC_FORCEINLINE int reduce_min_index(VFloat v) { return 0; }
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {}
#elif ICBC_SIMD == ICBC_SSE2 || ICBC_SIMD == ICBC_SSE41
constexpr int VEC_SIZE = 4;
#if __GNUC__
// GCC needs a struct so that we can overload operators.
union VFloat {
__m128 v;
float m128_f32[VEC_SIZE];
VFloat() {}
VFloat(__m128 v) : v(v) {}
operator __m128 & () { return v; }
};
union VMask {
__m128 m;
VMask() {}
VMask(__m128 m) : m(m) {}
operator __m128 & () { return m; }
};
#else
using VFloat = __m128;
using VMask = __m128;
#endif
ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
return v.m128_f32[i];
}
ICBC_FORCEINLINE VFloat vzero() {
return _mm_setzero_ps();
}
ICBC_FORCEINLINE VFloat vbroadcast(float x) {
return _mm_set1_ps(x);
}
ICBC_FORCEINLINE VFloat vload(const float * ptr) {
return _mm_load_ps(ptr);
}
ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
VFloat v;
for (int i = 0; i < VEC_SIZE; i++) {
lane(v, i) = base[int(lane(index, i))];
}
return v;
}
ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
return _mm_add_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
return _mm_sub_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
return _mm_mul_ps(a, b);
}
ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
VFloat r = _mm_rcp_ps(a);
return _mm_mul_ps(r, _mm_sub_ps(vbroadcast(2.0f), _mm_mul_ps(r, a))); // r * (2 - r * a)
#else
return _mm_div_ps(vbroadcast(1.0f), a);
#endif
}
// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
return a * b + c;
}
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
return a * b - c;
}
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
return a * b - c * d;
}
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
auto zero = _mm_setzero_ps();
auto one = _mm_set1_ps(1.0f);
return _mm_min_ps(_mm_max_ps(a, zero), one);
}
// Assumes a is in [0, 1] range.
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
#if ICBC_SIMD == ICBC_SSE41
return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
#else
return _mm_cvtepi32_ps(_mm_cvttps_epi32(a + vbroadcast(0.5f)));
#endif
}
ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
#if ICBC_SIMD == ICBC_SSE41
return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
#endif
}
ICBC_FORCEINLINE VFloat lane_id() {
return _mm_set_ps(3, 2, 1, 0);
}
ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm_cmpgt_ps(A, B); }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm_cmpge_ps(A, B); }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm_cmplt_ps(A, B); }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm_cmple_ps(A, B); }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm_or_ps(A, B); }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm_and_ps(A, B); }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm_xor_ps(A, B); }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
#if ICBC_SIMD == ICBC_SSE41
return _mm_blendv_ps(a, b, mask);
#else
return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
#endif
}
ICBC_FORCEINLINE VMask vbroadcast(bool b) {
return _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
}
ICBC_FORCEINLINE bool all(VMask m) {
int value = _mm_movemask_ps(m);
return value == 0xF; // All 4 lanes set.
}
ICBC_FORCEINLINE bool any(VMask m) {
int value = _mm_movemask_ps(m);
return value != 0;
}
ICBC_FORCEINLINE uint mask(VMask m) {
return (uint)_mm_movemask_ps(m);
}
ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
// First do a horizontal reduction. // v = [ D C | B A ]
VFloat shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ]
VFloat mins = _mm_min_ps(v, shuf); // mins = [ min(C,D) min(C,D) | min(A,B) min(A,B) ]
shuf = _mm_movehl_ps(shuf, mins); // [ C D | min(C,D) min(C,D) ] // let the compiler avoid a mov by reusing shuf
mins = _mm_min_ss(mins, shuf);
mins = _mm_shuffle_ps(mins, mins, _MM_SHUFFLE(0, 0, 0, 0));
// Then find the index.
uint mask = _mm_movemask_ps(v <= mins);
return ctz(mask);
}
// https://gcc.gnu.org/legacy-ml/gcc-patches/2005-10/msg00324.html
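// 4x4 transpose of the four row registers:
//   r0 = ( x00 x01 x02 x03 )        r0 = ( x00 x10 x20 x30 )
//   r1 = ( x10 x11 x12 x13 )  ==>   r1 = ( x01 x11 x21 x31 )
//   r2 = ( x20 x21 x22 x23 )        r2 = ( x02 x12 x22 x32 )
//   r3 = ( x30 x31 x32 x33 )        r3 = ( x03 x13 x23 x33 )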
ICBC_FORCEINLINE void vtranspose4(VFloat & r0, VFloat & r1, VFloat & r2, VFloat & r3) {
VFloat t0 = _mm_unpacklo_ps(r0, r1);
VFloat t1 = _mm_unpacklo_ps(r2, r3);
VFloat t2 = _mm_unpackhi_ps(r0, r1);
VFloat t3 = _mm_unpackhi_ps(r2, r3);
r0 = _mm_movelh_ps(t0, t1);
r1 = _mm_movehl_ps(t1, t0);
r2 = _mm_movelh_ps(t2, t3);
r3 = _mm_movehl_ps(t3, t2);
}
#elif ICBC_SIMD == ICBC_AVX1 || ICBC_SIMD == ICBC_AVX2
constexpr int VEC_SIZE = 8;
#if __GNUC__
union VFloat {
__m256 v;
float m256_f32[VEC_SIZE];
VFloat() {}
VFloat(__m256 v) : v(v) {}
operator __m256 & () { return v; }
};
union VInt {
__m256i v;
int m256_i32[VEC_SIZE];
VInt() {}
VInt(__m256i v) : v(v) {}
operator __m256i & () { return v; }
};
union VMask {
__m256 m;
VMask() {}
VMask(__m256 m) : m(m) {}
operator __m256 & () { return m; }
};
#else
using VFloat = __m256;
using VInt = __m256i;
using VMask = __m256; // Emulate mask vector using packed float.
#endif
ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
return v.m256_f32[i];
}
ICBC_FORCEINLINE VFloat vzero() {
return _mm256_setzero_ps();
}
ICBC_FORCEINLINE VFloat vbroadcast(float a) {
return _mm256_set1_ps(a);
}
ICBC_FORCEINLINE VFloat vload(const float * ptr) {
return _mm256_load_ps(ptr);
}
ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
#if ICBC_SIMD == ICBC_AVX2
return _mm256_i32gather_ps(base, _mm256_cvtps_epi32(index), 4);
#else
VFloat v;
for (int i = 0; i < VEC_SIZE; i++) {
lane(v, i) = base[int(lane(index, i))];
}
return v;
#endif
}
ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
return _mm256_add_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
return _mm256_sub_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
return _mm256_mul_ps(a, b);
}
ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
#if ICBC_SIMD == ICBC_AVX512
VFloat r = _mm256_rcp14_ps(a);
#else
VFloat r = _mm256_rcp_ps(a);
#endif
// r = r * (2 - r * a)
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
return _mm256_mul_ps(r, _mm256_fnmadd_ps(r, a, vbroadcast(2.0f)));
#else
return _mm256_mul_ps(r, _mm256_sub_ps(vbroadcast(2.0f), _mm256_mul_ps(r, a)));
#endif
#else
return _mm256_div_ps(vbroadcast(1.0f), a);
#endif
}
// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
return _mm256_fmadd_ps(a, b, c);
#elif ICBC_USE_FMA == 4
return _mm256_macc_ps(a, b, c);
#else
return ((a * b) + c);
#endif
}
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
#if ICBC_USE_FMA == 3 || ICBC_SIMD == ICBC_AVX2
return _mm256_fmsub_ps(a, b, c);
#elif ICBC_USE_FMA == 4
return _mm256_msub_ps(a, b, c);
#else
return ((a * b) - c);
#endif
}
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
return vmsub(a, b, c * d);
}
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
__m256 zero = _mm256_setzero_ps();
__m256 one = _mm256_set1_ps(1.0f);
return _mm256_min_ps(_mm256_max_ps(a, zero), one);
}
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
return _mm256_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
ICBC_FORCEINLINE VFloat lane_id() {
return _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
}
ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GT_OQ); }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_GE_OQ); }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LT_OQ); }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return _mm256_cmp_ps(A, B, _CMP_LE_OQ); }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return _mm256_or_ps(A, B); }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return _mm256_and_ps(A, B); }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return _mm256_xor_ps(A, B); }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
return _mm256_blendv_ps(a, b, mask);
}
ICBC_FORCEINLINE VMask vbroadcast(bool b) {
return _mm256_castsi256_ps(_mm256_set1_epi32(-int32_t(b)));
}
ICBC_FORCEINLINE bool all(VMask m) {
__m256 ones = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), _CMP_EQ_UQ);
return _mm256_testc_ps(m, ones) != 0; // CF = 1 iff (~m & ones) == 0, i.e. all lanes of m set.
}
ICBC_FORCEINLINE bool any(VMask m) {
return _mm256_testz_ps(m, m) == 0;
}
ICBC_FORCEINLINE uint mask(VMask m) {
return (uint)_mm256_movemask_ps(m);
}
// This is missing on some GCC versions.
#if !defined _mm256_set_m128
#define _mm256_set_m128(hi, lo) _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
#endif
ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
__m128 vlow = _mm256_castps256_ps128(v);
__m128 vhigh = _mm256_extractf128_ps(v, 1);
vlow = _mm_min_ps(vlow, vhigh);
// First do a horizontal reduction. // v = [ D C | B A ]
__m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ]
__m128 mins = _mm_min_ps(vlow, shuf); // mins = [ min(C,D) min(C,D) | min(A,B) min(A,B) ]
shuf = _mm_movehl_ps(shuf, mins); // [ C D | min(C,D) min(C,D) ]
mins = _mm_min_ss(mins, shuf);
VFloat vmin = _mm256_permute_ps(_mm256_set_m128(mins, mins), 0); // _MM256_PERMUTE(0, 0, 0, 0, 0, 0, 0, 0)
// Then find the index.
uint mask = _mm256_movemask_ps(v <= vmin);
return ctz(mask);
}
// AoS to SoA
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
VFloat r0 = _mm256_unpacklo_ps(a, b);
VFloat r1 = _mm256_unpacklo_ps(c, d);
VFloat r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
VFloat r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
r0 = _mm256_unpackhi_ps(a, b);
r1 = _mm256_unpackhi_ps(c, d);
a = _mm256_unpacklo_ps(r2, r3);
b = _mm256_unpackhi_ps(r2, r3);
r2 = _mm256_permute2f128_ps(r0, r1, 0x20);
r3 = _mm256_permute2f128_ps(r0, r1, 0x31);
c = _mm256_unpacklo_ps(r2, r3);
d = _mm256_unpackhi_ps(r2, r3);
}
ICBC_FORCEINLINE VInt vzeroi() {
return _mm256_setzero_si256();
}
ICBC_FORCEINLINE VInt vbroadcast(int a) {
return _mm256_set1_epi32(a);
}
ICBC_FORCEINLINE VInt vload(const int * ptr) {
return _mm256_load_si256((const __m256i*)ptr);
}
ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm256_sub_epi32(A, _mm256_set1_epi32(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm256_and_si256(A, _mm256_set1_epi32(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm256_srli_epi32(A, b); }
ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpgt_epi32(A, _mm256_set1_epi32(b))); }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return _mm256_castsi256_ps(_mm256_cmpeq_epi32(A, _mm256_set1_epi32(b))); }
// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
return _mm256_and_ps(_mm256_permutevar8x32_ps(v, idx), mask);
}
// mask ? (idx > 7 ? vhi[idx] : vlo[idx]) : 0
ICBC_FORCEINLINE VFloat vpermute2if(VMask mask, VFloat vlo, VFloat vhi, VInt idx) {
#if 0
VMask mhi = idx > 7;
vlo = _mm256_permutevar8x32_ps(vlo, idx);
vhi = _mm256_permutevar8x32_ps(vhi, idx);
VFloat v = _mm256_blendv_ps(vlo, vhi, mhi);
return _mm256_and_ps(v, mask);
#else
// Fabian Giesen says not to mix _mm256_blendv_ps and _mm256_permutevar8x32_ps since they contend for the same gates and instead suggests the following:
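// The identity at work: after vhi ^= vlo, for any lane vlo[idx] ^ (vlo ^ vhi)[idx] == vhi[idx],
// so xor-ing in the permuted difference (only where idx > 7) selects the high half without a blend.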
vhi = _mm256_xor_ps(vhi, vlo);
VFloat v = _mm256_permutevar8x32_ps(vlo, idx);
VMask mhi = idx > 7;
v = _mm256_xor_ps(v, _mm256_and_ps(_mm256_permutevar8x32_ps(vhi, idx), mhi));
return _mm256_and_ps(v, mask);
#endif
}
#elif ICBC_SIMD == ICBC_AVX512
constexpr int VEC_SIZE = 16;
#if __GNUC__
union VFloat {
__m512 v;
float m512_f32[VEC_SIZE];
VFloat() {}
VFloat(__m512 v) : v(v) {}
operator __m512 & () { return v; }
};
union VInt {
__m512i v;
int m512i_i32[VEC_SIZE];
VInt() {}
VInt(__m512i v) : v(v) {}
operator __m512i & () { return v; }
};
#else
using VFloat = __m512;
using VInt = __m512i;
#endif
struct VMask { __mmask16 m; };
ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
return v.m512_f32[i];
}
ICBC_FORCEINLINE VFloat vzero() {
return _mm512_setzero_ps();
}
ICBC_FORCEINLINE VFloat vbroadcast(float a) {
return _mm512_set1_ps(a);
}
ICBC_FORCEINLINE VFloat vload(const float * ptr) {
return _mm512_load_ps(ptr);
}
ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr) {
return _mm512_mask_load_ps(_mm512_undefined(), mask.m, ptr);
}
ICBC_FORCEINLINE VFloat vload(VMask mask, const float * ptr, float fallback) {
return _mm512_mask_load_ps(_mm512_set1_ps(fallback), mask.m, ptr);
}
ICBC_FORCEINLINE VFloat vgather(const float * base, VFloat index) {
return _mm512_i32gather_ps(_mm512_cvtps_epi32(index), base, 4);
}
ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
return _mm512_add_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
return _mm512_sub_ps(a, b);
}
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
return _mm512_mul_ps(a, b);
}
ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
VFloat r = _mm512_rcp14_ps(a);
// r = r * (2 - r * a)
return _mm512_mul_ps(r, _mm512_fnmadd_ps(r, a, vbroadcast(2.0f)));
#else
return _mm512_div_ps(vbroadcast(1.0f), a);
#endif
}
// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
return _mm512_fmadd_ps(a, b, c);
}
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
return _mm512_fmsub_ps(a, b, c);
}
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
return vmsub(a, b, c * d);
}
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
auto zero = _mm512_setzero_ps();
auto one = _mm512_set1_ps(1.0f);
return _mm512_min_ps(_mm512_max_ps(a, zero), one);
}
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEAREST_INT);
}
ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
}
ICBC_FORCEINLINE VFloat lane_id() {
return _mm512_set_ps(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}
ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) }; }
ICBC_FORCEINLINE VMask operator! (VMask A) { return { _mm512_knot(A.m) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { _mm512_kor(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { _mm512_kand(A.m, B.m) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { _mm512_kxor(A.m, B.m) }; }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
return _mm512_mask_blend_ps(mask.m, a, b);
}
ICBC_FORCEINLINE VMask vbroadcast(bool b) {
return { __mmask16(-int16_t(b)) };
}
ICBC_FORCEINLINE bool all(VMask mask) {
return mask.m == 0xFFFF; // __mmask16: all 16 lanes set.
}
ICBC_FORCEINLINE bool any(VMask mask) {
return mask.m != 0;
}
ICBC_FORCEINLINE uint mask(VMask mask) {
return mask.m;
}
ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
// First do a horizontal reduction.
VFloat vmin = vbroadcast(_mm512_reduce_min_ps(v));
// Then find the index.
VMask mask = (v <= vmin);
return ctz(mask.m);
}
//ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d); // @@
ICBC_FORCEINLINE int lane(VInt v, int i) {
//return _mm256_extract_epi32(v, i);
return v.m512i_i32[i];
}
ICBC_FORCEINLINE VInt vzeroi() {
return _mm512_setzero_epi32();
}
ICBC_FORCEINLINE VInt vbroadcast(int a) {
return _mm512_set1_epi32(a);
}
ICBC_FORCEINLINE VInt vload(const int * ptr) {
return _mm512_load_epi32(ptr);
}
ICBC_FORCEINLINE VInt operator- (VInt A, int b) { return _mm512_sub_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator& (VInt A, int b) { return _mm512_and_epi32(A, vbroadcast(b)); }
ICBC_FORCEINLINE VInt operator>> (VInt A, int b) { return _mm512_srli_epi32(A, b); }
ICBC_FORCEINLINE VMask operator> (VInt A, int b) { return { _mm512_cmpgt_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator>=(VInt A, int b) { return { _mm512_cmpge_epi32_mask(A, vbroadcast(b)) }; }
ICBC_FORCEINLINE VMask operator== (VInt A, int b) { return { _mm512_cmpeq_epi32_mask(A, vbroadcast(b)) }; }
// mask ? v[idx] : 0
ICBC_FORCEINLINE VFloat vpermuteif(VMask mask, VFloat v, VInt idx) {
return _mm512_maskz_permutexvar_ps(mask.m, idx, v);
}
#elif ICBC_SIMD == ICBC_NEON
constexpr int VEC_SIZE = 4;
#if __GNUC__
union VFloat {
float32x4_t v;
float e[4];
VFloat() {}
VFloat(float32x4_t v) : v(v) {}
operator float32x4_t & () { return v; }
};
struct VMask {
uint32x4_t v;
VMask() {}
VMask(uint32x4_t v) : v(v) {}
operator uint32x4_t & () { return v; }
};
#else
using VFloat = float32x4_t;
using VMask = uint32x4_t;
#endif
ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
#if defined(__clang__)
return v.e[i];
#elif defined(_MSC_VER)
return v.n128_f32[i];
#else
return v.v[i];
#endif
}
ICBC_FORCEINLINE VFloat vzero() {
return vdupq_n_f32(0.0f);
}
ICBC_FORCEINLINE VFloat vbroadcast(float a) {
return vdupq_n_f32(a);
}
ICBC_FORCEINLINE VFloat vload(const float * ptr) {
return vld1q_f32(ptr);
}
ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
return vaddq_f32(a, b);
}
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
return vsubq_f32(a, b);
}
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
return vmulq_f32(a, b);
}
ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
VFloat rcp = vrecpeq_f32(a);
return vmulq_f32(vrecpsq_f32(a, rcp), rcp); // One Newton-Raphson refinement step.
//return vdivq_f32(vbroadcast(1.0f), a); // Exact alternative; AArch64 only.
}
// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
return vmlaq_f32(c, a, b);
}
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
return a * b - c;
}
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
// vmlsq_f32(a, b, c) == a - (b * c)
return vmlsq_f32(a * b, c, d);
}
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
return vminq_f32(vmaxq_f32(a, vzero()), vbroadcast(1));
}
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
#if __ARM_ARCH >= 8
return vrndnq_f32(a); // Round to integral (to nearest, ties to even).
#else
return vcvtq_f32_s32(vcvtq_s32_f32(a + vbroadcast(0.5f)));
#endif
}
ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
#if __ARM_ARCH >= 8
return vrndq_f32(a); // Round towards zero.
#else
return vcvtq_f32_s32(vcvtq_s32_f32(a));
#endif
}
ICBC_FORCEINLINE VFloat lane_id() {
ICBC_ALIGN_16 float data[4] = { 0, 1, 2, 3 };
return vld1q_f32(data);
}
ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vcgtq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vcgeq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vcltq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vcleq_f32(A, B) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vorrq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vandq_u32(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { veorq_u32(A, B) }; }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
return vbslq_f32(mask, b, a);
}
ICBC_FORCEINLINE bool all(VMask mask) {
uint32x2_t m2 = vpmin_u32(vget_low_u32(mask), vget_high_u32(mask));
uint32x2_t m1 = vpmin_u32(m2, m2);
#if defined(_MSC_VER)
return m1.n64_u32[0] != 0;
#else
return m1[0] != 0;
#endif
}
ICBC_FORCEINLINE bool any(VMask mask) {
uint32x2_t m2 = vpmax_u32(vget_low_u32(mask), vget_high_u32(mask));
uint32x2_t m1 = vpmax_u32(m2, m2);
#if defined(_MSC_VER)
return m1.n64_u32[0] != 0;
#else
return m1[0] != 0;
#endif
}
// @@ Is this the best we can do?
// From: https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h
ICBC_FORCEINLINE uint mask(VMask mask) {
static const uint32x4_t movemask = { 1, 2, 4, 8 };
static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
uint32x4_t t1 = vtstq_u32(mask, highbit);
uint32x4_t t2 = vandq_u32(t1, movemask);
uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
inline int reduce_min_index(VFloat v) {
#if 0
float32x2_t m2 = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
float32x2_t m1 = vpmin_f32(m2, m2);
float min_value = vget_lane_f32(m1, 0);
VFloat vmin = vbroadcast(min_value);
// (v <= vmin)
// @@ Find the lane that contains min_value?
#endif
// @@ Is there a better way to do this reduction?
int min_idx = 0;
float min_value = lane(v, 0);
for (int i = 1; i < VEC_SIZE; i++) {
float value = lane(v, i);
if (value < min_value) {
min_value = value;
min_idx = i;
}
}
return min_idx;
}
// https://github.com/Maratyszcza/NNPACK/blob/master/src/neon/transpose.h
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
// row0 = ( x00 x01 x02 x03 )
// row1 = ( x10 x11 x12 x13 )
// row2 = ( x20 x21 x22 x23 )
// row3 = ( x30 x31 x32 x33 )
// row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03 x13 )
// row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23 x33 )
float32x4x2_t row01 = vtrnq_f32(a, b);
float32x4x2_t row23 = vtrnq_f32(c, d);
// row0 = ( x00 x10 x20 x30 )
// row1 = ( x01 x11 x21 x31 )
// row2 = ( x02 x12 x22 x32 )
// row3 = ( x03 x13 x23 x33 )
a = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
b = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
c = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
d = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
}
#elif ICBC_SIMD == ICBC_VMX
constexpr int VEC_SIZE = 4;
union VFloat {
vector float v;
float e[4];
VFloat() {}
VFloat(vector float v) : v(v) {}
operator vector float & () { return v; }
};
struct VMask {
vector unsigned int v;
VMask() {}
VMask(vector unsigned int v) : v(v) {}
operator vector unsigned int & () { return v; }
};
ICBC_FORCEINLINE float & lane(VFloat & v, int i) {
return v.e[i];
}
ICBC_FORCEINLINE VFloat vzero() {
return vec_splats(0.0f);
}
ICBC_FORCEINLINE VFloat vbroadcast(float a) {
return vec_splats(a);
}
ICBC_FORCEINLINE VFloat vload(const float * ptr) {
return vec_ld(0, ptr);
}
ICBC_FORCEINLINE VFloat operator+(VFloat a, VFloat b) {
return vec_add(a, b);
}
ICBC_FORCEINLINE VFloat operator-(VFloat a, VFloat b) {
return vec_sub(a, b);
}
ICBC_FORCEINLINE VFloat operator*(VFloat a, VFloat b) {
return vec_madd(a, b, vec_splats(-0.0f));
}
ICBC_FORCEINLINE VFloat vrcp(VFloat a) {
#if ICBC_USE_RCP
// Get the reciprocal estimate.
vector float estimate = vec_re(a);
// One round of Newton-Raphson refinement.
vector float diff = vec_nmsub(estimate, a, vec_splats(1.0f));
return vec_madd(diff, estimate, estimate);
#else
return vec_div(vec_splats(1.0f), a);
#endif
}
// a*b+c
ICBC_FORCEINLINE VFloat vmadd(VFloat a, VFloat b, VFloat c) {
return vec_madd(a, b, c);
}
ICBC_FORCEINLINE VFloat vmsub(VFloat a, VFloat b, VFloat c) {
return vec_msub(a, b, c); // @@ Is this right?
}
ICBC_FORCEINLINE VFloat vm2sub(VFloat a, VFloat b, VFloat c, VFloat d) {
return vmsub(a, b, c * d);
}
ICBC_FORCEINLINE VFloat vsaturate(VFloat a) {
return vec_min(vec_max(a, vzero()), vbroadcast(1));
}
ICBC_FORCEINLINE VFloat vround01(VFloat a) {
// @@ Assumes a is positive and ~small
return vec_trunc(a + vbroadcast(0.5));
}
ICBC_FORCEINLINE VFloat vtruncate(VFloat a) {
return vec_trunc(a);
}
ICBC_FORCEINLINE VFloat lane_id() {
return (VFloat){ 0, 1, 2, 3 };
}
ICBC_FORCEINLINE VMask operator> (VFloat A, VFloat B) { return { vec_cmpgt(A, B) }; }
ICBC_FORCEINLINE VMask operator>=(VFloat A, VFloat B) { return { vec_cmpge(A, B) }; }
ICBC_FORCEINLINE VMask operator< (VFloat A, VFloat B) { return { vec_cmplt(A, B) }; }
ICBC_FORCEINLINE VMask operator<=(VFloat A, VFloat B) { return { vec_cmple(A, B) }; }
ICBC_FORCEINLINE VMask operator| (VMask A, VMask B) { return { vec_or(A, B) }; }
ICBC_FORCEINLINE VMask operator& (VMask A, VMask B) { return { vec_and(A, B) }; }
ICBC_FORCEINLINE VMask operator^ (VMask A, VMask B) { return { vec_xor(A, B) }; }
// mask ? b : a
ICBC_FORCEINLINE VFloat vselect(VMask mask, VFloat a, VFloat b) {
return vec_sel(a, b, mask);
}
ICBC_FORCEINLINE int reduce_min_index(VFloat v) {
//VFloat vmin = //@@ Horizontal min?
//return vec_cmpeq_idx(v, vmin);
// @@ Is there a better way to do this reduction?
int min_idx = 0;
float min_value = lane(v, 0);
for (int i = 1; i < VEC_SIZE; i++) {
float value = lane(v, i);
if (value < min_value) {
min_value = value;
min_idx = i;
}
}
return min_idx;
}
ICBC_FORCEINLINE void vtranspose4(VFloat & a, VFloat & b, VFloat & c, VFloat & d) {
VFloat t1 = vec_mergeh(a, c);
VFloat t2 = vec_mergel(a, c);
VFloat t3 = vec_mergeh(b, d);
VFloat t4 = vec_mergel(b, d);
a = vec_mergeh(t1, t3);
b = vec_mergel(t1, t3);
c = vec_mergeh(t2, t4);
d = vec_mergel(t2, t4);
}
#endif // ICBC_SIMD == *
#if ICBC_SIMD != ICBC_SCALAR
ICBC_FORCEINLINE VFloat vmadd(VFloat a, float b, VFloat c) {
VFloat vb = vbroadcast(b);
return vmadd(a, vb, c);
}
#endif
struct VVector3 {
VFloat x;
VFloat y;
VFloat z;
};
ICBC_FORCEINLINE VVector3 vbroadcast(Vector3 v) {
VVector3 v8;
v8.x = vbroadcast(v.x);
v8.y = vbroadcast(v.y);
v8.z = vbroadcast(v.z);
return v8;
}
ICBC_FORCEINLINE VVector3 vbroadcast(float x, float y, float z) {
VVector3 v8;
v8.x = vbroadcast(x);
v8.y = vbroadcast(y);
v8.z = vbroadcast(z);
return v8;
}
/*ICBC_FORCEINLINE VVector3 vload(const Vector3 * v) {
// @@ See this for a 8x3 transpose: https://software.intel.com/content/www/us/en/develop/articles/3d-vector-normalization-using-256-bit-intel-advanced-vector-extensions-intel-avx.html
}*/
ICBC_FORCEINLINE VVector3 vload(const Vector4 * ptr) {
#if ICBC_SIMD == ICBC_AVX512
// @@ AVX512 transpose not implemented.
__m512i vindex = _mm512_set_epi32(4 * 15, 4 * 14, 4 * 13, 4 * 12, 4 * 11, 4 * 10, 4 * 9, 4 * 8, 4 * 7, 4 * 6, 4 * 5, 4 * 4, 4 * 3, 4 * 2, 4 * 1, 0);
VVector3 v;
v.x = _mm512_i32gather_ps(vindex, &ptr->x, 4);
v.y = _mm512_i32gather_ps(vindex, &ptr->y, 4);
v.z = _mm512_i32gather_ps(vindex, &ptr->z, 4);
return v;
#else
VVector3 v;
v.x = vload(&ptr->x + 0 * VEC_SIZE);
v.y = vload(&ptr->x + 1 * VEC_SIZE);
v.z = vload(&ptr->x + 2 * VEC_SIZE);
VFloat tmp = vload(&ptr->x + 3 * VEC_SIZE);
vtranspose4(v.x, v.y, v.z, tmp);
return v;
#endif
}
ICBC_FORCEINLINE VVector3 operator+(VVector3 a, VVector3 b) {
VVector3 v;
v.x = (a.x + b.x);
v.y = (a.y + b.y);
v.z = (a.z + b.z);
return v;
}
ICBC_FORCEINLINE VVector3 operator-(VVector3 a, VVector3 b) {
VVector3 v8;
v8.x = (a.x - b.x);
v8.y = (a.y - b.y);
v8.z = (a.z - b.z);
return v8;
}
ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VVector3 b) {
VVector3 v8;
v8.x = (a.x * b.x);
v8.y = (a.y * b.y);
v8.z = (a.z * b.z);
return v8;
}
ICBC_FORCEINLINE VVector3 operator*(VVector3 a, VFloat b) {
VVector3 v8;
v8.x = (a.x * b);
v8.y = (a.y * b);
v8.z = (a.z * b);
return v8;
}
ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VVector3 b, VVector3 c) {
VVector3 v8;
v8.x = vmadd(a.x, b.x, c.x);
v8.y = vmadd(a.y, b.y, c.y);
v8.z = vmadd(a.z, b.z, c.z);
return v8;
}
ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, VFloat b, VVector3 c) {
VVector3 v8;
v8.x = vmadd(a.x, b, c.x);
v8.y = vmadd(a.y, b, c.y);
v8.z = vmadd(a.z, b, c.z);
return v8;
}
#if ICBC_SIMD != ICBC_SCALAR
ICBC_FORCEINLINE VVector3 vmadd(VVector3 a, float b, VVector3 c) {
VVector3 v8;
VFloat vb = vbroadcast(b);
v8.x = vmadd(a.x, vb, c.x);
v8.y = vmadd(a.y, vb, c.y);
v8.z = vmadd(a.z, vb, c.z);
return v8;
}
#endif
ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VFloat c) {
VVector3 v8;
v8.x = vmsub(a.x, b, c);
v8.y = vmsub(a.y, b, c);
v8.z = vmsub(a.z, b, c);
return v8;
}
ICBC_FORCEINLINE VVector3 vmsub(VVector3 a, VFloat b, VVector3 c) {
VVector3 v8;
v8.x = vmsub(a.x, b, c.x);
v8.y = vmsub(a.y, b, c.y);
v8.z = vmsub(a.z, b, c.z);
return v8;
}
ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VFloat b, VVector3 c, VFloat d) {
VVector3 v;
v.x = vm2sub(a.x, b, c.x, d);
v.y = vm2sub(a.y, b, c.y, d);
v.z = vm2sub(a.z, b, c.z, d);
return v;
}
ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VFloat d) {
VVector3 v;
v.x = vm2sub(a.x, b.x, c.x, d);
v.y = vm2sub(a.y, b.y, c.y, d);
v.z = vm2sub(a.z, b.z, c.z, d);
return v;
}
ICBC_FORCEINLINE VVector3 vm2sub(VVector3 a, VVector3 b, VVector3 c, VVector3 d) {
VVector3 v;
v.x = vm2sub(a.x, b.x, c.x, d.x);
v.y = vm2sub(a.y, b.y, c.y, d.y);
v.z = vm2sub(a.z, b.z, c.z, d.z);
return v;
}
ICBC_FORCEINLINE VFloat vdot(VVector3 a, VVector3 b) {
VFloat r;
r = a.x * b.x + a.y * b.y + a.z * b.z;
return r;
}
// Length squared.
ICBC_FORCEINLINE VFloat vlen2(VVector3 v) {
return vdot(v, v);
}
// mask ? b : a
ICBC_FORCEINLINE VVector3 vselect(VMask mask, VVector3 a, VVector3 b) {
VVector3 r;
r.x = vselect(mask, a.x, b.x);
r.y = vselect(mask, a.y, b.y);
r.z = vselect(mask, a.z, b.z);
return r;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Color conversion functions.
static const float midpoints5[32] = {
0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, FLT_MAX
};
static const float midpoints6[64] = {
0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, FLT_MAX
};
/*void init_tables() {
for (int i = 0; i < 31; i++) {
float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f;
float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f;
midpoints5[i] = (f0 + f1) * 0.5;
}
midpoints5[31] = FLT_MAX;
for (int i = 0; i < 63; i++) {
float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f;
float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f;
midpoints6[i] = (f0 + f1) * 0.5;
}
midpoints6[63] = FLT_MAX;
}*/
ICBC_FORCEINLINE VFloat vround5(VFloat x) {
const VFloat rb_scale = vbroadcast(31.0f);
const VFloat rb_inv_scale = vbroadcast(1.0f / 31.0f);
#if ICBC_PERFECT_ROUND
VFloat q = vtruncate(x * rb_scale);
VFloat mp = vgather(midpoints5, q);
//return (q + (vbroadcast(1.0f) & (x > mp))) * rb_inv_scale;
return (q + vselect(x > mp, vzero(), vbroadcast(1.0f))) * rb_inv_scale;
#else
return vround01(x * rb_scale) * rb_inv_scale;
#endif
}
ICBC_FORCEINLINE VFloat vround6(VFloat x) {
const VFloat g_scale = vbroadcast(63.0f);
const VFloat g_inv_scale = vbroadcast(1.0f / 63.0f);
#if ICBC_PERFECT_ROUND
VFloat q = vtruncate(x * g_scale);
VFloat mp = vgather(midpoints6, q);
//return (q + (vbroadcast(1) & (x > mp))) * g_inv_scale;
return (q + vselect(x > mp, vzero(), vbroadcast(1.0f))) * g_inv_scale;
#else
return vround01(x * g_scale) * g_inv_scale;
#endif
}
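// With ICBC_PERFECT_ROUND enabled, the value is truncated to the 5- or 6-bit grid and then
// bumped up one step only if the input lies above the midpoint between the two adjacent
// bit-expanded (565 -> 888) values; i.e. rounding happens in decoded 8-bit space rather
// than in quantized space.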
ICBC_FORCEINLINE VVector3 vround_ept(VVector3 v) {
VVector3 r;
r.x = vround5(vsaturate(v.x));
r.y = vround6(vsaturate(v.y));
r.z = vround5(vsaturate(v.z));
return r;
}
static Color16 vector3_to_color16(const Vector3 & v) {
// Truncate.
uint r = uint(clamp(v.x * 31.0f, 0.0f, 31.0f));
uint g = uint(clamp(v.y * 63.0f, 0.0f, 63.0f));
uint b = uint(clamp(v.z * 31.0f, 0.0f, 31.0f));
// Round exactly according to 565 bit-expansion.
r += (v.x > midpoints5[r]);
g += (v.y > midpoints6[g]);
b += (v.z > midpoints5[b]);
Color16 c;
c.u = uint16((r << 11) | (g << 5) | b);
return c;
}
static Color32 bitexpand_color16_to_color32(Color16 c16) {
Color32 c32;
//c32.b = (c16.b << 3) | (c16.b >> 2);
//c32.g = (c16.g << 2) | (c16.g >> 4);
//c32.r = (c16.r << 3) | (c16.r >> 2);
//c32.a = 0xFF;
c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000);
c32.u |= (c32.u >> 5) & 0x070007;
c32.u |= (c32.u >> 6) & 0x000300;
return c32;
}
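// For example, a 5-bit channel x expands to 8 bits as (x << 3) | (x >> 2): 0 -> 0 and 31 -> 255.
// Replicating the top bits into the low bits is what makes the midpoint tables above non-uniform.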
inline Vector3 color_to_vector3(Color32 c) {
return { c.r / 255.0f, c.g / 255.0f, c.b / 255.0f };
}
inline Color32 vector3_to_color32(Vector3 v) {
Color32 color;
color.r = uint8(saturate(v.x) * 255 + 0.5f);
color.g = uint8(saturate(v.y) * 255 + 0.5f);
color.b = uint8(saturate(v.z) * 255 + 0.5f);
color.a = 255;
return color;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Input block processing.
inline bool is_black(Vector3 c) {
// This large threshold seems to improve compression. It does not force these texels to be black; it just
// causes them to be ignored during PCA.
// @@ We may want to adjust this based on the quality level, since this increases the number of blocks that try cluster-3 fitting.
//return c.x < midpoints5[0] && c.y < midpoints6[0] && c.z < midpoints5[0];
//return c.x < 1.0f / 32 && c.y < 1.0f / 32 && c.z < 1.0f / 32;
return c.x < 1.0f / 8 && c.y < 1.0f / 8 && c.z < 1.0f / 8;
}
// Find similar colors and combine them together.
static int reduce_colors(const Vector4 * input_colors, const float * input_weights, int count, float threshold, Vector3 * colors, float * weights, bool * any_black)
{
#if 0
for (int i = 0; i < 16; i++) {
colors[i] = input_colors[i].xyz;
weights[i] = input_weights[i];
}
return 16;
#else
*any_black = false;
int n = 0;
for (int i = 0; i < count; i++)
{
Vector3 ci = input_colors[i].xyz;
float wi = input_weights[i];
if (wi > 0) {
// Find matching color.
int j;
for (j = 0; j < n; j++) {
if (equal(colors[j], ci, threshold)) {
colors[j] = (colors[j] * weights[j] + ci * wi) / (weights[j] + wi);
weights[j] += wi;
break;
}
}
// No match found. Add new color.
if (j == n) {
colors[n] = ci;
weights[n] = wi;
n++;
}
if (is_black(ci)) {
*any_black = true;
}
}
}
ICBC_ASSERT(n <= count);
return n;
#endif
}
static int skip_blacks(const Vector3 * input_colors, const float * input_weights, int count, Vector3 * colors, float * weights)
{
int n = 0;
for (int i = 0; i < count; i++)
{
Vector3 ci = input_colors[i];
float wi = input_weights[i];
if (is_black(ci)) {
continue;
}
colors[n] = ci;
weights[n] = wi;
n += 1;
}
return n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// PCA
static Vector3 computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights)
{
Vector3 centroid = { 0 };
float total = 0.0f;
for (int i = 0; i < n; i++)
{
total += weights[i];
centroid += weights[i] * points[i];
}
centroid *= (1.0f / total);
return centroid;
}
static Vector3 computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, float *__restrict covariance)
{
// compute the centroid
Vector3 centroid = computeCentroid(n, points, weights);
// compute covariance matrix
for (int i = 0; i < 6; i++)
{
covariance[i] = 0.0f;
}
for (int i = 0; i < n; i++)
{
Vector3 a = (points[i] - centroid); // @@ I think weight should be squared, but that seems to increase the error slightly.
Vector3 b = weights[i] * a;
covariance[0] += a.x * b.x;
covariance[1] += a.x * b.y;
covariance[2] += a.x * b.z;
covariance[3] += a.y * b.y;
covariance[4] += a.y * b.z;
covariance[5] += a.z * b.z;
}
return centroid;
}
// @@ We should be able to do something cheaper...
static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
{
const Vector3 row0 = { matrix[0], matrix[1], matrix[2] };
const Vector3 row1 = { matrix[1], matrix[3], matrix[4] };
const Vector3 row2 = { matrix[2], matrix[4], matrix[5] };
float r0 = lengthSquared(row0);
float r1 = lengthSquared(row1);
float r2 = lengthSquared(row2);
if (r0 > r1 && r0 > r2) return row0;
if (r1 > r2) return row1;
return row2;
}
static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
{
if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
{
return {0};
}
Vector3 v = estimatePrincipalComponent(matrix);
const int NUM = 8;
for (int i = 0; i < NUM; i++)
{
float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
float norm = max(max(x, y), z);
v = { x, y, z };
v *= (1.0f / norm);
}
return v;
}
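// Standard power iteration: repeatedly multiply the estimate by the covariance matrix; the
// iterate converges towards the eigenvector with the largest eigenvalue, which is the axis of
// maximum color variance. Dividing by the largest component instead of the length is cheaper
// and good enough here, since only the direction of the result matters.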
static Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights)
{
float matrix[6];
computeCovariance(n, points, weights, matrix);
return firstEigenVector_PowerMethod(matrix);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// SAT
struct SummedAreaTable {
ICBC_ALIGN_16 float r[16];
ICBC_ALIGN_16 float g[16];
ICBC_ALIGN_16 float b[16];
ICBC_ALIGN_16 float w[16];
};
int compute_sat(const Vector3 * colors, const float * weights, int count, SummedAreaTable * sat)
{
// I've tried using a lower quality approximation of the principal direction, but the best fit line seems to produce the best results.
Vector3 principal = computePrincipalComponent_PowerMethod(count, colors, weights);
// build the list of values
int order[16];
float dps[16];
for (int i = 0; i < count; ++i)
{
order[i] = i;
dps[i] = dot(colors[i], principal);
}
// stable sort
for (int i = 0; i < count; ++i)
{
for (int j = i; j > 0 && dps[j] < dps[j - 1]; --j)
{
swap(dps[j], dps[j - 1]);
swap(order[j], order[j - 1]);
}
}
float w = weights[order[0]];
sat->r[0] = colors[order[0]].x * w;
sat->g[0] = colors[order[0]].y * w;
sat->b[0] = colors[order[0]].z * w;
sat->w[0] = w;
for (int i = 1; i < count; i++) {
w = weights[order[i]];
sat->r[i] = sat->r[i - 1] + colors[order[i]].x * w;
sat->g[i] = sat->g[i - 1] + colors[order[i]].y * w;
sat->b[i] = sat->b[i - 1] + colors[order[i]].z * w;
sat->w[i] = sat->w[i - 1] + w;
}
// Try incremental decimation:
/*if (count > 4)
{
float threshold = 1.0f / 4;
for (uint i = 1; i < count; ++i)
{
if (sat->r[i] - sat->r[i - 1] < threshold &&
sat->g[i] - sat->g[i - 1] < threshold &&
sat->b[i] - sat->b[i - 1] < threshold)
{
for (int j = i+1; j < count; j++) {
sat->r[j - 1] = sat->r[j];
sat->g[j - 1] = sat->g[j];
sat->b[j - 1] = sat->b[j];
sat->w[j - 1] = sat->w[j];
}
count -= 1;
i -= 1;
if (count == 4) break;
}
}
}*/
for (int i = count; i < 16; i++) {
sat->r[i] = FLT_MAX;
sat->g[i] = FLT_MAX;
sat->b[i] = FLT_MAX;
sat->w[i] = FLT_MAX;
}
return count;
}
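// With the colors sorted along the principal axis, the SAT reduces any per-cluster sum to a
// difference of prefixes: the weighted sum of the first c colors is sat[c-1], so a cluster
// covering sorted positions [a, b) sums to sat[b-1] - sat[a-1]. The unused tail entries are
// set to FLT_MAX so out-of-range lanes in the vectorized cluster fit produce huge errors and
// lose, instead of needing an explicit mask.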
///////////////////////////////////////////////////////////////////////////////////////////////////
// Cluster Fit
struct Combinations {
uint8 c0, c1, c2, pad;
};
static ICBC_ALIGN_16 int s_fourClusterTotal[16];
static ICBC_ALIGN_16 int s_threeClusterTotal[16];
static ICBC_ALIGN_16 Combinations s_fourCluster[968 + 8];
static ICBC_ALIGN_16 Combinations s_threeCluster[152 + 8];
#if ICBC_USE_NEON_VTL
static uint8 s_neon_vtl_index0_4[4 * 968];
static uint8 s_neon_vtl_index1_4[4 * 968];
static uint8 s_neon_vtl_index2_4[4 * 968];
static uint8 s_neon_vtl_index0_3[4 * 152];
static uint8 s_neon_vtl_index1_3[4 * 152];
#endif
static void init_cluster_tables() {
for (int t = 1, i = 0; t <= 16; t++) {
for (int c0 = 0; c0 <= t; c0++) {
for (int c1 = 0; c1 <= t - c0; c1++) {
for (int c2 = 0; c2 <= t - c0 - c1; c2++) {
// Skip this cluster so that the total is a multiple of 8
if (c0 == 0 && c1 == 0 && c2 == 0) continue;
bool found = false;
if (t > 1) {
for (int j = 0; j < s_fourClusterTotal[t-2]; j++) {
if (s_fourCluster[j].c0 == c0 && s_fourCluster[j].c1 == c0+c1 && s_fourCluster[j].c2 == c0+c1+c2) {
found = true;
break;
}
}
}
if (!found) {
s_fourCluster[i].c0 = uint8(c0);
s_fourCluster[i].c1 = uint8(c0+c1);
s_fourCluster[i].c2 = uint8(c0+c1+c2);
i++;
}
}
}
}
s_fourClusterTotal[t - 1] = i;
}
// Replicate last entry.
for (int i = 0; i < 8; i++) {
s_fourCluster[968 + i] = s_fourCluster[968-1];
}
for (int t = 1, i = 0; t <= 16; t++) {
for (int c0 = 0; c0 <= t; c0++) {
for (int c1 = 0; c1 <= t - c0; c1++) {
// Skip this cluster so that the total is a multiple of 8
if (c0 == 0 && c1 == 0) continue;
bool found = false;
if (t > 1) {
for (int j = 0; j < s_threeClusterTotal[t - 2]; j++) {
if (s_threeCluster[j].c0 == c0 && s_threeCluster[j].c1 == c0+c1) {
found = true;
break;
}
}
}
if (!found) {
s_threeCluster[i].c0 = uint8(c0);
s_threeCluster[i].c1 = uint8(c0 + c1);
i++;
}
}
}
s_threeClusterTotal[t - 1] = i;
}
// Replicate last entry.
for (int i = 0; i < 8; i++) {
s_threeCluster[152 + i] = s_threeCluster[152 - 1];
}
#if ICBC_USE_NEON_VTL
for (int i = 0; i < 968; i++) {
int c0 = (s_fourCluster[i].c0) - 1;
s_neon_vtl_index0_4[4 * i + 0] = uint8(c0 * 4 + 0);
s_neon_vtl_index0_4[4 * i + 1] = uint8(c0 * 4 + 1);
s_neon_vtl_index0_4[4 * i + 2] = uint8(c0 * 4 + 2);
s_neon_vtl_index0_4[4 * i + 3] = uint8(c0 * 4 + 3);
int c1 = (s_fourCluster[i].c1) - 1;
s_neon_vtl_index1_4[4 * i + 0] = uint8(c1 * 4 + 0);
s_neon_vtl_index1_4[4 * i + 1] = uint8(c1 * 4 + 1);
s_neon_vtl_index1_4[4 * i + 2] = uint8(c1 * 4 + 2);
s_neon_vtl_index1_4[4 * i + 3] = uint8(c1 * 4 + 3);
int c2 = (s_fourCluster[i].c2) - 1;
s_neon_vtl_index2_4[4 * i + 0] = uint8(c2 * 4 + 0);
s_neon_vtl_index2_4[4 * i + 1] = uint8(c2 * 4 + 1);
s_neon_vtl_index2_4[4 * i + 2] = uint8(c2 * 4 + 2);
s_neon_vtl_index2_4[4 * i + 3] = uint8(c2 * 4 + 3);
}
for (int i = 0; i < 152; i++) {
int c0 = (s_threeCluster[i].c0) - 1;
s_neon_vtl_index0_3[4 * i + 0] = uint8(c0 * 4 + 0);
s_neon_vtl_index0_3[4 * i + 1] = uint8(c0 * 4 + 1);
s_neon_vtl_index0_3[4 * i + 2] = uint8(c0 * 4 + 2);
s_neon_vtl_index0_3[4 * i + 3] = uint8(c0 * 4 + 3);
int c1 = (s_threeCluster[i].c1) - 1;
s_neon_vtl_index1_3[4 * i + 0] = uint8(c1 * 4 + 0);
s_neon_vtl_index1_3[4 * i + 1] = uint8(c1 * 4 + 1);
s_neon_vtl_index1_3[4 * i + 2] = uint8(c1 * 4 + 2);
s_neon_vtl_index1_3[4 * i + 3] = uint8(c1 * 4 + 3);
}
#endif
}
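// Table sizes, for reference: the cumulative boundaries 0 <= c0 <= c1 <= c2 <= 16 give
// C(19, 3) = 969 four-cluster combinations, minus the skipped all-zero entry = 968; likewise
// 0 <= c0 <= c1 <= 16 gives C(18, 2) = 153 - 1 = 152 three-cluster combinations. The 8 extra
// slots replicate the last entry so the vector loops below can safely read past the end.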
static void cluster_fit_three(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end)
{
const float r_sum = sat.r[count-1];
const float g_sum = sat.g[count-1];
const float b_sum = sat.b[count-1];
const float w_sum = sat.w[count-1];
VFloat vbesterror = vbroadcast(FLT_MAX);
VVector3 vbeststart = { vzero(), vzero(), vzero() };
VVector3 vbestend = { vzero(), vzero(), vzero() };
// check all possible clusters for this total order
const int total_order_count = s_threeClusterTotal[count - 1];
for (int i = 0; i < total_order_count; i += VEC_SIZE)
{
VVector3 x0, x1;
VFloat w0, w1;
#if ICBC_USE_AVX512_PERMUTE
auto loadmask = lane_id() < vbroadcast(float(count));
// Load sat in one register:
VFloat vrsat = vload(loadmask, sat.r, FLT_MAX);
VFloat vgsat = vload(loadmask, sat.g, FLT_MAX);
VFloat vbsat = vload(loadmask, sat.b, FLT_MAX);
VFloat vwsat = vload(loadmask, sat.w, FLT_MAX);
// Load 4 uint8 per lane.
VInt packedClusterIndex = vload((int *)&s_threeCluster[i]);
VInt c0 = (packedClusterIndex & 0xFF) - 1;
VInt c1 = (packedClusterIndex >> 8) - 1;
x0.x = vpermuteif(c0 >= 0, vrsat, c0);
x0.y = vpermuteif(c0 >= 0, vgsat, c0);
x0.z = vpermuteif(c0 >= 0, vbsat, c0);
w0 = vpermuteif(c0 >= 0, vwsat, c0);
x1.x = vpermuteif(c1 >= 0, vrsat, c1);
x1.y = vpermuteif(c1 >= 0, vgsat, c1);
x1.z = vpermuteif(c1 >= 0, vbsat, c1);
w1 = vpermuteif(c1 >= 0, vwsat, c1);
#elif ICBC_USE_AVX2_PERMUTE2
// Load 4 uint8 per lane. @@ Ideally I should pack this better and load only 2.
VInt packedClusterIndex = vload((int *)&s_threeCluster[i]);
VInt c0 = (packedClusterIndex & 0xFF);
VInt c1 = ((packedClusterIndex >> 8)); // No need for & 0xFF
if (count <= 8) {
// Load sat.r in one register:
VFloat rLo = vload(sat.r);
VFloat gLo = vload(sat.g);
VFloat bLo = vload(sat.b);
VFloat wLo = vload(sat.w);
x0.x = vpermuteif(c0>0, rLo, c0-1);
x0.y = vpermuteif(c0>0, gLo, c0-1);
x0.z = vpermuteif(c0>0, bLo, c0-1);
w0 = vpermuteif(c0>0, wLo, c0-1);
x1.x = vpermuteif(c1>0, rLo, c1-1);
x1.y = vpermuteif(c1>0, gLo, c1-1);
x1.z = vpermuteif(c1>0, bLo, c1-1);
w1 = vpermuteif(c1>0, wLo, c1-1);
}
else {
// Load sat.r in two registers:
VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8);
VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8);
VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8);
VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8);
x0.x = vpermute2if(c0>0, rLo, rHi, c0-1);
x0.y = vpermute2if(c0>0, gLo, gHi, c0-1);
x0.z = vpermute2if(c0>0, bLo, bHi, c0-1);
w0 = vpermute2if(c0>0, wLo, wHi, c0-1);
x1.x = vpermute2if(c1>0, rLo, rHi, c1-1);
x1.y = vpermute2if(c1>0, gLo, gHi, c1-1);
x1.z = vpermute2if(c1>0, bLo, bHi, c1-1);
w1 = vpermute2if(c1>0, wLo, wHi, c1-1);
}
#elif ICBC_USE_NEON_VTL
uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_3[4*i];
uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_3[4*i];
if (count <= 4) {
uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r);
uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g);
uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b);
uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0));
w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1));
w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1));
}
else if (count <= 8) {
uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r);
uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g);
uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b);
uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0));
w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1));
w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1));
}
else if (count <= 12) {
uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r);
uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g);
uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b);
uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0));
w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1));
w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1));
}
else {
// Load SAT.
uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r);
uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g);
uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b);
uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0));
w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1));
w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1));
}
#else
// Scalar path
x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero();
x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero();
for (int l = 0; l < VEC_SIZE; l++) {
int c0 = s_threeCluster[i + l].c0 - 1;
if (c0 >= 0) {
lane(x0.x, l) = sat.r[c0];
lane(x0.y, l) = sat.g[c0];
lane(x0.z, l) = sat.b[c0];
lane(w0, l) = sat.w[c0];
}
int c1 = s_threeCluster[i + l].c1 - 1;
if (c1 >= 0) {
lane(x1.x, l) = sat.r[c1];
lane(x1.y, l) = sat.g[c1];
lane(x1.z, l) = sat.b[c1];
lane(w1, l) = sat.w[c1];
}
}
#endif
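// Least-squares endpoints for this cluster assignment. With palette weights alpha in {1, 1/2, 0}
// (and beta = 1 - alpha), minimizing E = sum_i w_i * |alpha_i*a + beta_i*b - x_i|^2 yields 2x2
// normal equations in a and b. Below, alpha2_sum = sum w_i*alpha_i^2, beta2_sum = sum w_i*beta_i^2,
// alphabeta_sum = sum w_i*alpha_i*beta_i, and alphax_sum / betax_sum are the corresponding
// weighted color sums; a and b are then recovered with Cramer's rule via `factor`.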
VFloat w2 = vbroadcast(w_sum) - w1;
x1 = x1 - x0;
w1 = w1 - w0;
VFloat alphabeta_sum = w1 * vbroadcast(0.25f);
VFloat alpha2_sum = w0 + alphabeta_sum;
VFloat beta2_sum = w2 + alphabeta_sum;
VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum));
VVector3 alphax_sum = x0 + x1 * vbroadcast(0.5f);
VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum;
VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor;
VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor;
// snap to the grid
a = vround_ept(a);
b = vround_ept(b);
// compute the error
VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f);
VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2));
// apply the metric to the error term
VFloat error = vdot(e1, vbroadcast(metric_sqr));
// keep the solution if it wins
auto mask = (error < vbesterror);
// I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX.
//mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds?
vbesterror = vselect(mask, vbesterror, error);
vbeststart = vselect(mask, vbeststart, a);
vbestend = vselect(mask, vbestend, b);
}
int bestindex = reduce_min_index(vbesterror);
start->x = lane(vbeststart.x, bestindex);
start->y = lane(vbeststart.y, bestindex);
start->z = lane(vbeststart.z, bestindex);
end->x = lane(vbestend.x, bestindex);
end->y = lane(vbestend.y, bestindex);
end->z = lane(vbestend.z, bestindex);
}
static void cluster_fit_four(const SummedAreaTable & sat, int count, Vector3 metric_sqr, Vector3 * start, Vector3 * end)
{
const float r_sum = sat.r[count-1];
const float g_sum = sat.g[count-1];
const float b_sum = sat.b[count-1];
const float w_sum = sat.w[count-1];
VFloat vbesterror = vbroadcast(FLT_MAX);
VVector3 vbeststart = { vzero(), vzero(), vzero() };
VVector3 vbestend = { vzero(), vzero(), vzero() };
// check all possible clusters for this total order
const int total_order_count = s_fourClusterTotal[count - 1];
for (int i = 0; i < total_order_count; i += VEC_SIZE)
{
VVector3 x0, x1, x2;
VFloat w0, w1, w2;
/*
// Another approach would be to load and broadcast one color at a time like I do in my old CUDA implementation.
uint akku = 0;
// Compute alpha & beta for this permutation.
#pragma unroll
for (int i = 0; i < 16; i++)
{
const uint bits = permutation >> (2*i);
alphax_sum += alphaTable4[bits & 3] * colors[i];
akku += prods4[bits & 3];
}
float alpha2_sum = float(akku >> 16);
float beta2_sum = float((akku >> 8) & 0xff);
float alphabeta_sum = float(akku & 0xff);
float3 betax_sum = 9.0f * color_sum - alphax_sum;
*/
#if ICBC_USE_AVX512_PERMUTE
auto loadmask = lane_id() < vbroadcast(float(count));
// Load sat in one register:
VFloat vrsat = vload(loadmask, sat.r, FLT_MAX);
VFloat vgsat = vload(loadmask, sat.g, FLT_MAX);
VFloat vbsat = vload(loadmask, sat.b, FLT_MAX);
VFloat vwsat = vload(loadmask, sat.w, FLT_MAX);
// Load 4 uint8 per lane.
VInt packedClusterIndex = vload((int *)&s_fourCluster[i]);
VInt c0 = (packedClusterIndex & 0xFF) - 1;
VInt c1 = ((packedClusterIndex >> 8) & 0xFF) - 1;
VInt c2 = ((packedClusterIndex >> 16)) - 1; // @@ No need for &
x0.x = vpermuteif(c0 >= 0, vrsat, c0);
x0.y = vpermuteif(c0 >= 0, vgsat, c0);
x0.z = vpermuteif(c0 >= 0, vbsat, c0);
w0 = vpermuteif(c0 >= 0, vwsat, c0);
x1.x = vpermuteif(c1 >= 0, vrsat, c1);
x1.y = vpermuteif(c1 >= 0, vgsat, c1);
x1.z = vpermuteif(c1 >= 0, vbsat, c1);
w1 = vpermuteif(c1 >= 0, vwsat, c1);
x2.x = vpermuteif(c2 >= 0, vrsat, c2);
x2.y = vpermuteif(c2 >= 0, vgsat, c2);
x2.z = vpermuteif(c2 >= 0, vbsat, c2);
w2 = vpermuteif(c2 >= 0, vwsat, c2);
#elif ICBC_USE_AVX2_PERMUTE2
// Load 4 uint8 per lane.
VInt packedClusterIndex = vload((int *)&s_fourCluster[i]);
VInt c0 = (packedClusterIndex & 0xFF);
VInt c1 = ((packedClusterIndex >> 8) & 0xFF);
VInt c2 = ((packedClusterIndex >> 16)); // @@ No need for &
if (count <= 8) {
// Load sat.r in one register:
VFloat rLo = vload(sat.r);
VFloat gLo = vload(sat.g);
VFloat bLo = vload(sat.b);
VFloat wLo = vload(sat.w);
x0.x = vpermuteif(c0>0, rLo, c0-1);
x0.y = vpermuteif(c0>0, gLo, c0-1);
x0.z = vpermuteif(c0>0, bLo, c0-1);
w0 = vpermuteif(c0>0, wLo, c0-1);
x1.x = vpermuteif(c1>0, rLo, c1-1);
x1.y = vpermuteif(c1>0, gLo, c1-1);
x1.z = vpermuteif(c1>0, bLo, c1-1);
w1 = vpermuteif(c1>0, wLo, c1-1);
x2.x = vpermuteif(c2>0, rLo, c2-1);
x2.y = vpermuteif(c2>0, gLo, c2-1);
x2.z = vpermuteif(c2>0, bLo, c2-1);
w2 = vpermuteif(c2>0, wLo, c2-1);
}
else {
// Load sat.r in two registers:
VFloat rLo = vload(sat.r); VFloat rHi = vload(sat.r + 8);
VFloat gLo = vload(sat.g); VFloat gHi = vload(sat.g + 8);
VFloat bLo = vload(sat.b); VFloat bHi = vload(sat.b + 8);
VFloat wLo = vload(sat.w); VFloat wHi = vload(sat.w + 8);
x0.x = vpermute2if(c0>0, rLo, rHi, c0-1);
x0.y = vpermute2if(c0>0, gLo, gHi, c0-1);
x0.z = vpermute2if(c0>0, bLo, bHi, c0-1);
w0 = vpermute2if(c0>0, wLo, wHi, c0-1);
x1.x = vpermute2if(c1>0, rLo, rHi, c1-1);
x1.y = vpermute2if(c1>0, gLo, gHi, c1-1);
x1.z = vpermute2if(c1>0, bLo, bHi, c1-1);
w1 = vpermute2if(c1>0, wLo, wHi, c1-1);
x2.x = vpermute2if(c2>0, rLo, rHi, c2-1);
x2.y = vpermute2if(c2>0, gLo, gHi, c2-1);
x2.z = vpermute2if(c2>0, bLo, bHi, c2-1);
w2 = vpermute2if(c2>0, wLo, wHi, c2-1);
}
#elif ICBC_USE_NEON_VTL
uint8x16_t idx0 = (uint8x16_t &)s_neon_vtl_index0_4[4*i];
uint8x16_t idx1 = (uint8x16_t &)s_neon_vtl_index1_4[4*i];
uint8x16_t idx2 = (uint8x16_t &)s_neon_vtl_index2_4[4*i];
if (count <= 4) {
uint8x16_t rsat1 = vld1q_u8((uint8*)sat.r);
uint8x16_t gsat1 = vld1q_u8((uint8*)sat.g);
uint8x16_t bsat1 = vld1q_u8((uint8*)sat.b);
uint8x16_t wsat1 = vld1q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx0));
w0 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx1));
w1 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx1));
x2.x = vreinterpretq_f32_u8(vqtbl1q_u8(rsat1, idx2));
x2.y = vreinterpretq_f32_u8(vqtbl1q_u8(gsat1, idx2));
x2.z = vreinterpretq_f32_u8(vqtbl1q_u8(bsat1, idx2));
w2 = vreinterpretq_f32_u8(vqtbl1q_u8(wsat1, idx2));
}
else if (count <= 8) {
uint8x16x2_t rsat2 = vld2q_u8((uint8*)sat.r);
uint8x16x2_t gsat2 = vld2q_u8((uint8*)sat.g);
uint8x16x2_t bsat2 = vld2q_u8((uint8*)sat.b);
uint8x16x2_t wsat2 = vld2q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx0));
w0 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx1));
w1 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx1));
x2.x = vreinterpretq_f32_u8(vqtbl2q_u8(rsat2, idx2));
x2.y = vreinterpretq_f32_u8(vqtbl2q_u8(gsat2, idx2));
x2.z = vreinterpretq_f32_u8(vqtbl2q_u8(bsat2, idx2));
w2 = vreinterpretq_f32_u8(vqtbl2q_u8(wsat2, idx2));
}
else if (count <= 12) {
uint8x16x3_t rsat3 = vld3q_u8((uint8*)sat.r);
uint8x16x3_t gsat3 = vld3q_u8((uint8*)sat.g);
uint8x16x3_t bsat3 = vld3q_u8((uint8*)sat.b);
uint8x16x3_t wsat3 = vld3q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx0));
w0 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx1));
w1 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx1));
x2.x = vreinterpretq_f32_u8(vqtbl3q_u8(rsat3, idx2));
x2.y = vreinterpretq_f32_u8(vqtbl3q_u8(gsat3, idx2));
x2.z = vreinterpretq_f32_u8(vqtbl3q_u8(bsat3, idx2));
w2 = vreinterpretq_f32_u8(vqtbl3q_u8(wsat3, idx2));
}
else {
uint8x16x4_t rsat4 = vld4q_u8((uint8*)sat.r);
uint8x16x4_t gsat4 = vld4q_u8((uint8*)sat.g);
uint8x16x4_t bsat4 = vld4q_u8((uint8*)sat.b);
uint8x16x4_t wsat4 = vld4q_u8((uint8*)sat.w);
x0.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx0));
x0.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx0));
x0.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx0));
w0 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx0));
x1.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx1));
x1.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx1));
x1.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx1));
w1 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx1));
x2.x = vreinterpretq_f32_u8(vqtbl4q_u8(rsat4, idx2));
x2.y = vreinterpretq_f32_u8(vqtbl4q_u8(gsat4, idx2));
x2.z = vreinterpretq_f32_u8(vqtbl4q_u8(bsat4, idx2));
w2 = vreinterpretq_f32_u8(vqtbl4q_u8(wsat4, idx2));
}
#else
// Scalar path
x0.x = vzero(); x0.y = vzero(); x0.z = vzero(); w0 = vzero();
x1.x = vzero(); x1.y = vzero(); x1.z = vzero(); w1 = vzero();
x2.x = vzero(); x2.y = vzero(); x2.z = vzero(); w2 = vzero();
for (int l = 0; l < VEC_SIZE; l++) {
int c0 = s_fourCluster[i + l].c0 - 1;
if (c0 >= 0) {
lane(x0.x, l) = sat.r[c0];
lane(x0.y, l) = sat.g[c0];
lane(x0.z, l) = sat.b[c0];
lane(w0, l) = sat.w[c0];
}
int c1 = s_fourCluster[i + l].c1 - 1;
if (c1 >= 0) {
lane(x1.x, l) = sat.r[c1];
lane(x1.y, l) = sat.g[c1];
lane(x1.z, l) = sat.b[c1];
lane(w1, l) = sat.w[c1];
}
int c2 = s_fourCluster[i + l].c2 - 1;
if (c2 >= 0) {
lane(x2.x, l) = sat.r[c2];
lane(x2.y, l) = sat.g[c2];
lane(x2.z, l) = sat.b[c2];
lane(w2, l) = sat.w[c2];
}
}
#endif
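// Same prefix-sum trick as the three-cluster case, now with alpha weights {1, 2/3, 1/3, 0}:
//   alpha2_sum = w0 + (4/9)*w1 + (1/9)*w2, beta2_sum = (1/9)*w1 + (4/9)*w2 + w3,
//   alphabeta_sum = (2/9)*(w1 + w2), alphax_sum = x0 + (2/3)*x1 + (1/3)*x2.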
VFloat w3 = vbroadcast(w_sum) - w2;
x2 = x2 - x1;
x1 = x1 - x0;
w2 = w2 - w1;
w1 = w1 - w0;
VFloat alpha2_sum = vmadd(w2, (1.0f / 9.0f), vmadd(w1, (4.0f / 9.0f), w0));
VFloat beta2_sum = vmadd(w1, (1.0f / 9.0f), vmadd(w2, (4.0f / 9.0f), w3));
VFloat alphabeta_sum = (w1 + w2) * vbroadcast(2.0f / 9.0f);
VFloat factor = vrcp(vm2sub(alpha2_sum, beta2_sum, alphabeta_sum, alphabeta_sum));
VVector3 alphax_sum = vmadd(x2, (1.0f / 3.0f), vmadd(x1, (2.0f / 3.0f), x0));
VVector3 betax_sum = vbroadcast(r_sum, g_sum, b_sum) - alphax_sum;
VVector3 a = vm2sub(alphax_sum, beta2_sum, betax_sum, alphabeta_sum) * factor;
VVector3 b = vm2sub(betax_sum, alpha2_sum, alphax_sum, alphabeta_sum) * factor;
// snap to the grid
a = vround_ept(a);
b = vround_ept(b);
// compute the error
VVector3 e2 = vm2sub(a, vmsub(b, alphabeta_sum, alphax_sum), b, betax_sum) * vbroadcast(2.0f);
VVector3 e1 = vmadd(a * a, alpha2_sum, vmadd(b * b, beta2_sum, e2));
// apply the metric to the error term
VFloat error = vdot(e1, vbroadcast(metric_sqr));
// keep the solution if it wins
auto mask = (error < vbesterror);
// I could mask the unused lanes here, but instead I set the invalid SAT entries to FLT_MAX.
//mask = (mask & (vbroadcast(total_order_count) >= tid8(i))); // This doesn't seem to help. Is it OK to consider elements out of bounds?
vbesterror = vselect(mask, vbesterror, error);
vbeststart = vselect(mask, vbeststart, a);
vbestend = vselect(mask, vbestend, b);
}
int bestindex = reduce_min_index(vbesterror);
start->x = lane(vbeststart.x, bestindex);
start->y = lane(vbeststart.y, bestindex);
start->z = lane(vbeststart.z, bestindex);
end->x = lane(vbestend.x, bestindex);
end->y = lane(vbestend.y, bestindex);
end->z = lane(vbestend.z, bestindex);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Palette evaluation.
Decoder s_decoder = Decoder_D3D10;
// D3D10
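// Ideal 1/3-2/3 interpolation. For example, expanded endpoints 255 and 0 produce
// interpolants (2*255+0)/3 = 170 and (2*0+255)/3 = 85.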
inline void evaluate_palette4_d3d10(Color32 palette[4]) {
palette[2].r = (2 * palette[0].r + palette[1].r) / 3;
palette[2].g = (2 * palette[0].g + palette[1].g) / 3;
palette[2].b = (2 * palette[0].b + palette[1].b) / 3;
palette[2].a = 0xFF;
palette[3].r = (2 * palette[1].r + palette[0].r) / 3;
palette[3].g = (2 * palette[1].g + palette[0].g) / 3;
palette[3].b = (2 * palette[1].b + palette[0].b) / 3;
palette[3].a = 0xFF;
}
inline void evaluate_palette3_d3d10(Color32 palette[4]) {
palette[2].r = (palette[0].r + palette[1].r) / 2;
palette[2].g = (palette[0].g + palette[1].g) / 2;
palette[2].b = (palette[0].b + palette[1].b) / 2;
palette[2].a = 0xFF;
palette[3].u = 0;
}
static void evaluate_palette_d3d10(Color16 c0, Color16 c1, Color32 palette[4]) {
palette[0] = bitexpand_color16_to_color32(c0);
palette[1] = bitexpand_color16_to_color32(c1);
if (c0.u > c1.u) {
evaluate_palette4_d3d10(palette);
}
else {
evaluate_palette3_d3d10(palette);
}
}
// NV
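// These formulas model the NVIDIA hardware decoder. Note that red and blue interpolate
// the raw 5-bit endpoints: (2*a+b)*22/8 maps the maximum sum (2*31+31 = 93) to 2046/8 = 255.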
inline void evaluate_palette4_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
int gdiff = palette[1].g - palette[0].g;
palette[2].r = uint8(((2 * c0.r + c1.r) * 22) / 8);
palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 80) / 256);
palette[2].b = uint8(((2 * c0.b + c1.b) * 22) / 8);
palette[2].a = 0xFF;
palette[3].r = uint8(((2 * c1.r + c0.r) * 22) / 8);
palette[3].g = uint8((256 * palette[1].g - gdiff / 4 + 128 - gdiff * 80) / 256);
palette[3].b = uint8(((2 * c1.b + c0.b) * 22) / 8);
palette[3].a = 0xFF;
}
inline void evaluate_palette3_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
int gdiff = palette[1].g - palette[0].g;
palette[2].r = uint8(((c0.r + c1.r) * 33) / 8);
palette[2].g = uint8((256 * palette[0].g + gdiff / 4 + 128 + gdiff * 128) / 256);
palette[2].b = uint8(((c0.b + c1.b) * 33) / 8);
palette[2].a = 0xFF;
palette[3].u = 0;
}
static void evaluate_palette_nv(Color16 c0, Color16 c1, Color32 palette[4]) {
palette[0] = bitexpand_color16_to_color32(c0);
palette[1] = bitexpand_color16_to_color32(c1);
if (c0.u > c1.u) {
evaluate_palette4_nv(c0, c1, palette);
}
else {
evaluate_palette3_nv(c0, c1, palette);
}
}
// AMD
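// The AMD decoder interpolates with weights 43/64 and 21/64 (close to 2/3 and 1/3),
// with a +32 bias for round-to-nearest.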
inline void evaluate_palette4_amd(Color32 palette[4]) {
palette[2].r = uint8((43 * palette[0].r + 21 * palette[1].r + 32) >> 6);
palette[2].g = uint8((43 * palette[0].g + 21 * palette[1].g + 32) >> 6);
palette[2].b = uint8((43 * palette[0].b + 21 * palette[1].b + 32) >> 6);
palette[2].a = 0xFF;
palette[3].r = uint8((43 * palette[1].r + 21 * palette[0].r + 32) >> 6);
palette[3].g = uint8((43 * palette[1].g + 21 * palette[0].g + 32) >> 6);
palette[3].b = uint8((43 * palette[1].b + 21 * palette[0].b + 32) >> 6);
palette[3].a = 0xFF;
}
inline void evaluate_palette3_amd(Color32 palette[4]) {
palette[2].r = uint8((palette[0].r + palette[1].r + 1) / 2);
palette[2].g = uint8((palette[0].g + palette[1].g + 1) / 2);
palette[2].b = uint8((palette[0].b + palette[1].b + 1) / 2);
palette[2].a = 0xFF;
palette[3].u = 0;
}
static void evaluate_palette_amd(Color16 c0, Color16 c1, Color32 palette[4]) {
palette[0] = bitexpand_color16_to_color32(c0);
palette[1] = bitexpand_color16_to_color32(c1);
if (c0.u > c1.u) {
evaluate_palette4_amd(palette);
}
else {
evaluate_palette3_amd(palette);
}
}
inline void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4], Decoder decoder = s_decoder) {
if (decoder == Decoder_D3D10) evaluate_palette_d3d10(c0, c1, palette);
else if (decoder == Decoder_NVIDIA) evaluate_palette_nv(c0, c1, palette);
else if (decoder == Decoder_AMD) evaluate_palette_amd(c0, c1, palette);
}
static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) {
Color32 palette32[4];
evaluate_palette(c0, c1, palette32);
for (int i = 0; i < 4; i++) {
palette[i] = color_to_vector3(palette32[i]);
}
}
static void decode_dxt1(const BlockDXT1 * block, unsigned char rgba_block[16 * 4], Decoder decoder)
{
Color32 palette[4];
evaluate_palette(block->col0, block->col1, palette, decoder);
for (int i = 0; i < 16; i++) {
int index = (block->indices >> (2 * i)) & 3;
Color32 c = palette[index];
rgba_block[4 * i + 0] = c.r;
rgba_block[4 * i + 1] = c.g;
rgba_block[4 * i + 2] = c.b;
rgba_block[4 * i + 3] = c.a;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Error evaluation.
// Different ways of estimating the error.
static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) {
Vector3 d = (p - c) * w * 255;
return dot(d, d);
}
static float evaluate_mse(const Color32 & p, const Vector3 & c, const Vector3 & w) {
Vector3 d = (color_to_vector3(p) - c) * w * 255;
return dot(d, d);
}
static int evaluate_mse(const Color32 & p, const Color32 & c) {
return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b));
}
static int evaluate_mse(const Color32 palette[4], const Color32 & c) {
int e0 = evaluate_mse(palette[0], c);
int e1 = evaluate_mse(palette[1], c);
int e2 = evaluate_mse(palette[2], c);
int e3 = evaluate_mse(palette[3], c);
return min(min(e0, e1), min(e2, e3));
}
// Returns the squared error of the selected palette entry (8-bit components, up to 3*255^2).
static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) {
Color32 palette[4];
evaluate_palette(output->col0, output->col1, palette);
return evaluate_mse(palette[index], color);
}
// Returns the weighted sum of squared errors (8-bit component scale).
static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) {
float total = 0.0f;
for (int i = 0; i < count; i++) {
total += weights[i] * evaluate_mse(palette, colors[i]);
}
return total;
}
static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, int count) {
float total = 0.0f;
for (int i = 0; i < count; i++) {
total += evaluate_mse(palette, colors[i]);
}
return total;
}
static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) {
Color32 palette[4];
evaluate_palette(output->col0, output->col1, palette);
// evaluate error for each index.
float error = 0.0f;
for (int i = 0; i < 16; i++) {
int index = (output->indices >> (2 * i)) & 3;
error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights);
}
return error;
}
static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3& color_weights, Vector3 palette[4], uint32 indices) {
// evaluate error for each index.
float error = 0.0f;
for (int i = 0; i < 16; i++) {
int index = (indices >> (2 * i)) & 3;
error += input_weights[i] * evaluate_mse(palette[index], input_colors[i].xyz, color_weights);
}
return error;
}
float evaluate_dxt1_error(const uint8 rgba_block[16*4], const BlockDXT1 * block, Decoder decoder) {
Color32 palette[4];
evaluate_palette(block->col0, block->col1, palette, decoder);
// evaluate error for each index.
float error = 0.0f;
for (int i = 0; i < 16; i++) {
int index = (block->indices >> (2 * i)) & 3;
Color32 c;
c.r = rgba_block[4 * i + 0];
c.g = rgba_block[4 * i + 1];
c.b = rgba_block[4 * i + 2];
c.a = 255;
error += evaluate_mse(palette[index], c);
}
return error;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Index selection
// @@ Can we interleave the two uint16 at once?
inline uint32 interleave_uint16_with_zeros(uint32 input) {
uint32 word = input;
word = (word ^ (word << 8 )) & 0x00ff00ff;
word = (word ^ (word << 4 )) & 0x0f0f0f0f;
word = (word ^ (word << 2 )) & 0x33333333;
word = (word ^ (word << 1 )) & 0x55555555;
return word;
}
// Interleave the bits. https://lemire.me/blog/2018/01/08/how-fast-can-you-bit-interleave-32-bit-integers/
ICBC_FORCEINLINE uint32 interleave(uint32 a, uint32 b) {
#if ICBC_BMI2
return _pdep_u32(a, 0x55555555) | _pdep_u32(b, 0xaaaaaaaa);
#else
return interleave_uint16_with_zeros(a) | (interleave_uint16_with_zeros(b) << 1);
#endif
}
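// For example, interleave(0b10, 0b01) = 0b0110: bit i of 'a' lands in result bit 2i and
// bit i of 'b' lands in result bit 2i+1.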
static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
uint indices0 = 0;
uint indices1 = 0;
VVector3 vw = vbroadcast(color_weights);
VVector3 vp0 = vbroadcast(palette[0]) * vw;
VVector3 vp1 = vbroadcast(palette[1]) * vw;
VVector3 vp2 = vbroadcast(palette[2]) * vw;
VVector3 vp3 = vbroadcast(palette[3]) * vw;
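// For each color, measure the weighted distance to all 4 palette entries and derive the
// 2-bit index with a small comparator network: indices0 collects the high bit of each
// index, indices1 the low bit, and interleave() merges the two planes at the end. The
// shortened network relies on the palette entries being (nearly) collinear in color space.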
for (int i = 0; i < 16; i += VEC_SIZE) {
VVector3 vc = vload(&input_colors[i]) * vw;
VFloat d0 = vlen2(vc - vp0);
VFloat d1 = vlen2(vc - vp1);
VFloat d2 = vlen2(vc - vp2);
VFloat d3 = vlen2(vc - vp3);
VMask b1 = d1 > d2;
VMask b2 = d0 > d2;
VMask x0 = b1 & b2;
VMask b0 = d0 > d3;
VMask b3 = d1 > d3;
x0 = x0 | (b0 & b3);
VMask b4 = d2 > d3;
VMask x1 = b0 & b4;
indices0 |= mask(x0) << i;
indices1 |= mask(x1) << i;
}
return interleave(indices1, indices0);
}
static uint compute_indices3(const Vector4 input_colors[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 palette[4]) {
uint indices0 = 0;
uint indices1 = 0;
VVector3 vw = vbroadcast(color_weights);
VVector3 vp0 = vbroadcast(palette[0]) * vw;
VVector3 vp1 = vbroadcast(palette[1]) * vw;
VVector3 vp2 = vbroadcast(palette[2]) * vw;
if (allow_transparent_black) {
for (int i = 0; i < 16; i += VEC_SIZE) {
VVector3 vc = vload(&input_colors[i]) * vw;
VFloat d0 = vlen2(vp0 - vc);
VFloat d1 = vlen2(vp1 - vc);
VFloat d2 = vlen2(vp2 - vc);
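// d3 is the squared distance to transparent black: palette entry 3 decodes to (0,0,0,0)
// in three-color mode.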
VFloat d3 = vdot(vc, vc);
VMask i1 = (d1 < d2);
VMask i2 = (d2 <= d0) & (d2 <= d1);
VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);
indices0 |= mask(i2 | i3) << i;
indices1 |= mask(i1 | i3) << i;
}
}
else {
for (int i = 0; i < 16; i += VEC_SIZE) {
VVector3 vc = vload(&input_colors[i]) * vw;
VFloat d0 = vlen2(vc - vp0);
VFloat d1 = vlen2(vc - vp1);
VFloat d2 = vlen2(vc - vp2);
VMask i1 = (d1 < d2);
VMask i2 = (d2 <= d0) & (d2 <= d1);
indices0 |= mask(i2) << i;
indices1 |= mask(i1) << i;
}
}
return interleave(indices1, indices0);
}
static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) {
#if 0
Vector3 p0 = palette[0] * color_weights;
Vector3 p1 = palette[1] * color_weights;
Vector3 p2 = palette[2] * color_weights;
Vector3 p3 = palette[3] * color_weights;
uint indices = 0;
for (int i = 0; i < 16; i++) {
Vector3 ci = input_colors[i].xyz * color_weights;
float d0 = lengthSquared(p0 - ci);
float d1 = lengthSquared(p1 - ci);
float d2 = lengthSquared(p2 - ci);
float d3 = lengthSquared(p3 - ci);
uint index;
if (d0 < d1 && d0 < d2 && d0 < d3) index = 0;
else if (d1 < d2 && d1 < d3) index = 1;
else if (d2 < d3) index = 2;
else index = 3;
indices |= index << (2 * i);
}
return indices;
#else
uint indices0 = 0;
uint indices1 = 0;
VVector3 vw = vbroadcast(color_weights);
VVector3 vp0 = vbroadcast(palette[0]) * vw;
VVector3 vp1 = vbroadcast(palette[1]) * vw;
VVector3 vp2 = vbroadcast(palette[2]) * vw;
VVector3 vp3 = vbroadcast(palette[3]) * vw;
for (int i = 0; i < 16; i += VEC_SIZE) {
VVector3 vc = vload(&input_colors[i]) * vw;
VFloat d0 = vlen2(vc - vp0);
VFloat d1 = vlen2(vc - vp1);
VFloat d2 = vlen2(vc - vp2);
VFloat d3 = vlen2(vc - vp3);
//VMask i0 = (d0 < d1) & (d0 < d2) & (d0 < d3);
VMask i1 = (d1 <= d0) & (d1 < d2) & (d1 < d3);
VMask i2 = (d2 <= d0) & (d2 <= d1) & (d2 < d3);
VMask i3 = (d3 <= d0) & (d3 <= d1) & (d3 <= d2);
//VFloat vindex = vselect(i0, vselect(i1, vselect(i2, vbroadcast(3), vbroadcast(2)), vbroadcast(1)), vbroadcast(0));
indices0 |= mask(i2 | i3) << i;
indices1 |= mask(i1 | i3) << i;
}
uint indices = interleave(indices1, indices0);
return indices;
#endif
}
static float output_block3(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool allow_transparent_black, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
Color16 color0 = vector3_to_color16(v0);
Color16 color1 = vector3_to_color16(v1);
if (color0.u > color1.u) {
swap(color0, color1);
}
Vector3 palette[4];
evaluate_palette(color0, color1, palette);
block->col0 = color0;
block->col1 = color1;
block->indices = compute_indices3(input_colors, color_weights, allow_transparent_black, palette);
return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}
static float output_block4(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block)
{
Color16 color0 = vector3_to_color16(v0);
Color16 color1 = vector3_to_color16(v1);
if (color0.u < color1.u) {
swap(color0, color1);
}
Vector3 palette[4];
evaluate_palette(color0, color1, palette);
block->col0 = color0;
block->col1 = color1;
block->indices = compute_indices4(input_colors, color_weights, palette);
return evaluate_mse(input_colors, input_weights, color_weights, palette, block->indices);
}
// Least squares fitting of color end points for the given indices. @@ Take weights into account.
static bool optimize_end_points4(uint indices, const Vector4 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
{
float alpha2_sum = 0.0f;
float beta2_sum = 0.0f;
float alphabeta_sum = 0.0f;
Vector3 alphax_sum = { 0,0,0 };
Vector3 betax_sum = { 0,0,0 };
for (int i = 0; i < count; i++)
{
const uint bits = indices >> (2 * i);
float beta = float(bits & 1);
if (bits & 2) beta = (1 + beta) / 3.0f;
float alpha = 1.0f - beta;
alpha2_sum += alpha * alpha;
beta2_sum += beta * beta;
alphabeta_sum += alpha * beta;
alphax_sum += alpha * colors[i].xyz;
betax_sum += beta * colors[i].xyz;
}
float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
if (equal(denom, 0.0f)) return false;
float factor = 1.0f / denom;
*a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
*b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
return true;
}
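// This and the variants below come from minimizing sum_i (alpha_i*a + beta_i*b - x_i)^2.
// Setting the derivatives with respect to a and b to zero gives the 2x2 normal equations:
//   [alpha2_sum    alphabeta_sum] [a]   [alphax_sum]
//   [alphabeta_sum beta2_sum    ] [b] = [betax_sum ]
// solved by Cramer's rule; 'denom' is the determinant, hence the singularity check.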
// Least squares optimization with custom factors.
// This lets us pass the standard [1, 0, 2/3, 1/3] weights by default, but also use alternative mappings when the number of clusters is not 4.
static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, float factors[4], Vector3 * a, Vector3 * b)
{
float alpha2_sum = 0.0f;
float beta2_sum = 0.0f;
float alphabeta_sum = 0.0f;
Vector3 alphax_sum = { 0,0,0 };
Vector3 betax_sum = { 0,0,0 };
for (int i = 0; i < count; i++)
{
const uint idx = (indices >> (2 * i)) & 3;
float alpha = factors[idx];
float beta = 1 - alpha;
alpha2_sum += alpha * alpha;
beta2_sum += beta * beta;
alphabeta_sum += alpha * beta;
alphax_sum += alpha * colors[i];
betax_sum += beta * colors[i];
}
float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
if (equal(denom, 0.0f)) return false;
float factor = 1.0f / denom;
*a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
*b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
return true;
}
static bool optimize_end_points4(uint indices, const Vector3 * colors, int count, Vector3 * a, Vector3 * b)
{
float factors[4] = { 1, 0, 2.f / 3, 1.f / 3 };
return optimize_end_points4(indices, colors, count, factors, a, b);
}
// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account.
static bool optimize_end_points3(uint indices, const Vector3 * colors, /*const float * weights,*/ int count, Vector3 * a, Vector3 * b)
{
float alpha2_sum = 0.0f;
float beta2_sum = 0.0f;
float alphabeta_sum = 0.0f;
Vector3 alphax_sum = { 0,0,0 };
Vector3 betax_sum = { 0,0,0 };
for (int i = 0; i < count; i++)
{
const uint bits = indices >> (2 * i);
float beta = float(bits & 1);
if (bits & 2) beta = 0.5f;
float alpha = 1.0f - beta;
alpha2_sum += alpha * alpha;
beta2_sum += beta * beta;
alphabeta_sum += alpha * beta;
alphax_sum += alpha * colors[i];
betax_sum += beta * colors[i];
}
float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum;
if (equal(denom, 0.0f)) return false;
float factor = 1.0f / denom;
*a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor);
*b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor);
return true;
}
// find minimum and maximum colors based on bounding box in color space
inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
{
*c0 = { 0,0,0 };
*c1 = { 1,1,1 };
for (int i = 0; i < count; i++) {
*c0 = max(*c0, colors[i]);
*c1 = min(*c1, colors[i]);
}
}
inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * __restrict c0, Vector3 * __restrict c1)
{
Vector3 center = (*c0 + *c1) * 0.5f;
/*Vector3 center = colors[0];
for (int i = 1; i < count; i++) {
center = center * float(i-1) / i + colors[i] / i;
}*/
/*Vector3 center = colors[0];
for (int i = 1; i < count; i++) {
center += colors[i];
}
center /= count;*/
float cov_xz = 0.0f;
float cov_yz = 0.0f;
for (int i = 0; i < count; i++) {
Vector3 t = colors[i] - center;
cov_xz += t.x * t.z;
cov_yz += t.y * t.z;
}
float x0 = c0->x;
float y0 = c0->y;
float x1 = c1->x;
float y1 = c1->y;
if (cov_xz < 0) {
swap(x0, x1);
}
if (cov_yz < 0) {
swap(y0, y1);
}
*c0 = { x0, y0, c0->z };
*c1 = { x1, y1, c1->z };
}
inline static void inset_bbox(Vector3 * __restrict c0, Vector3 * __restrict c1)
{
float bias = (8.0f / 255.0f) / 16.0f;
Vector3 inset = (*c0 - *c1) / 16.0f - scalar_to_vector3(bias);
*c0 = saturate(*c0 - inset);
*c1 = saturate(*c1 + inset);
}
// Single color lookup tables from:
// https://github.com/nothings/stb/blob/master/stb_dxt.h
static uint8 s_match5[256][2];
static uint8 s_match6[256][2];
static inline int Lerp13(int a, int b)
{
// replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
return (a * 2 + b) / 3;
}
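// The 0xaaab trick is exact here: 3 * 0xaaab = 0x20001, so (v * 0xaaab) >> 17 equals
// v / 3 for any v < 131072, and (a * 2 + b) is at most 765.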
static void PrepareOptTable5(uint8 * table, Decoder decoder)
{
uint8 expand[32];
for (int i = 0; i < 32; i++) expand[i] = uint8((i << 3) | (i >> 2));
for (int i = 0; i < 256; i++) {
int bestErr = 256 * 100;
for (int mn = 0; mn < 32; mn++) {
for (int mx = 0; mx < 32; mx++) {
int mine = expand[mn];
int maxe = expand[mx];
int err;
int amd_r = (43 * maxe + 21 * mine + 32) >> 6;
int amd_err = abs(amd_r - i);
int nv_r = ((2 * mx + mn) * 22) / 8;
int nv_err = abs(nv_r - i);
if (decoder == Decoder_D3D10) {
// DX10 spec says that interpolation must be within 3% of "correct" result,
// add this as error term. (normally we'd expect a random distribution of
// +-1.5% error, but nowhere in the spec does it say that the error has to be
// unbiased - better safe than sorry).
int r = (maxe * 2 + mine) / 3;
err = abs(r - i) * 100 + abs(mx - mn) * 3;
// Instead, use the worst of the AMD and NVIDIA errors (this supersedes the spec-based estimate above).
err = max(amd_err, nv_err);
}
else if (decoder == Decoder_NVIDIA) {
err = nv_err;
}
else /*if (decoder == Decoder_AMD)*/ {
err = amd_err;
}
if (err < bestErr) {
bestErr = err;
table[i * 2 + 0] = uint8(mx);
table[i * 2 + 1] = uint8(mn);
}
}
}
}
}
static void PrepareOptTable6(uint8 * table, Decoder decoder)
{
uint8 expand[64];
for (int i = 0; i < 64; i++) expand[i] = uint8((i << 2) | (i >> 4));
for (int i = 0; i < 256; i++) {
int bestErr = 256 * 100;
for (int mn = 0; mn < 64; mn++) {
for (int mx = 0; mx < 64; mx++) {
int mine = expand[mn];
int maxe = expand[mx];
int err;
int amd_g = (43 * maxe + 21 * mine + 32) >> 6;
int amd_err = abs(amd_g - i);
int nv_g = (256 * mine + (maxe - mine) / 4 + 128 + (maxe - mine) * 80) / 256;
int nv_err = abs(nv_g - i);
if (decoder == Decoder_D3D10) {
// DX10 spec says that interpolation must be within 3% of "correct" result,
// add this as error term. (normally we'd expect a random distribution of
// +-1.5% error, but nowhere in the spec does it say that the error has to be
// unbiased - better safe than sorry).
int g = (maxe * 2 + mine) / 3;
err = abs(g - i) * 100 + abs(mx - mn) * 3;
// Instead, use the worst of the AMD and NVIDIA errors (this supersedes the spec-based estimate above).
err = max(amd_err, nv_err);
}
else if (decoder == Decoder_NVIDIA) {
err = nv_err;
}
else /*if (decoder == Decoder_AMD)*/ {
err = amd_err;
}
if (err < bestErr) {
bestErr = err;
table[i * 2 + 0] = uint8(mx);
table[i * 2 + 1] = uint8(mn);
}
}
}
}
}
static void init_single_color_tables(Decoder decoder)
{
// Prepare single color lookup tables.
PrepareOptTable5(&s_match5[0][0], decoder);
PrepareOptTable6(&s_match6[0][0], decoder);
}
// Single color compressor, based on:
// https://mollyrocket.com/forums/viewtopic.php?t=392
static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output)
{
output->col0.r = s_match5[c.r][0];
output->col0.g = s_match6[c.g][0];
output->col0.b = s_match5[c.b][0];
output->col1.r = s_match5[c.r][1];
output->col1.g = s_match6[c.g][1];
output->col1.b = s_match5[c.b][1];
output->indices = 0xaaaaaaaa;
if (output->col0.u < output->col1.u)
{
swap(output->col0.u, output->col1.u);
output->indices ^= 0x55555555;
}
}
static float compress_dxt1_cluster_fit(const Vector4 input_colors[16], const float input_weights[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, bool try_transparent_black, bool allow_transparent_black, BlockDXT1 * output)
{
Vector3 metric_sqr = color_weights * color_weights;
SummedAreaTable sat;
int sat_count = compute_sat(colors, weights, count, &sat);
Vector3 start, end;
cluster_fit_four(sat, sat_count, metric_sqr, &start, &end);
float best_error = output_block4(input_colors, input_weights, color_weights, start, end, output);
if (three_color_mode) {
if (try_transparent_black) {
Vector3 tmp_colors[16];
float tmp_weights[16];
int tmp_count = skip_blacks(colors, weights, count, tmp_colors, tmp_weights);
if (!tmp_count) return best_error;
sat_count = compute_sat(tmp_colors, tmp_weights, tmp_count, &sat);
}
cluster_fit_three(sat, sat_count, metric_sqr, &start, &end);
BlockDXT1 three_color_block;
float three_color_error = output_block3(input_colors, input_weights, color_weights, allow_transparent_black, start, end, &three_color_block);
if (three_color_error < best_error) {
best_error = three_color_error;
*output = three_color_block;
}
}
return best_error;
}
static float refine_endpoints(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, float input_error, BlockDXT1 * output) {
// TODO:
// - Optimize palette evaluation when updating only one channel.
// - try all diagonals.
// Things that don't help:
// - Alternate endpoint updates.
// - Randomize order.
// - If one direction does not improve, test opposite direction next.
static const int8 deltas[16][3] = {
{1,0,0},
{0,1,0},
{0,0,1},
{-1,0,0},
{0,-1,0},
{0,0,-1},
{1,1,0},
{1,0,1},
{0,1,1},
{-1,-1,0},
{-1,0,-1},
{0,-1,-1},
{-1,1,0},
//{-1,0,1},
{1,-1,0},
{0,-1,1},
//{1,0,-1},
{0,1,-1},
};
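// Greedy hill climbing: each iteration perturbs one endpoint by one of the 16 deltas,
// alternating between col0 and col1 every 16 steps ((i / 16) & 1).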
float best_error = input_error;
int lastImprovement = 0;
for (int i = 0; i < 256; i++) {
BlockDXT1 refined = *output;
int8 delta[3] = { deltas[i % 16][0], deltas[i % 16][1], deltas[i % 16][2] };
if ((i / 16) & 1) {
refined.col0.r += delta[0];
refined.col0.g += delta[1];
refined.col0.b += delta[2];
}
else {
refined.col1.r += delta[0];
refined.col1.g += delta[1];
refined.col1.b += delta[2];
}
if (!three_color_mode) {
if (refined.col0.u == refined.col1.u) refined.col1.g += 1;
if (refined.col0.u < refined.col1.u) swap(refined.col0.u, refined.col1.u);
}
Vector3 palette[4];
evaluate_palette(refined.col0, refined.col1, palette); // Index selection must use the refined endpoints, not the stale ones.
refined.indices = compute_indices(input_colors, color_weights, palette);
float refined_error = evaluate_mse(input_colors, input_weights, color_weights, &refined);
if (refined_error < best_error) {
best_error = refined_error;
*output = refined;
lastImprovement = i;
}
// Early out if the last 32 steps didn't improve error.
if (i - lastImprovement > 32) break;
}
return best_error;
}
struct Options {
float threshold = 0.0f;
bool box_fit = false;
bool least_squares_fit = false;
bool cluster_fit = false;
bool cluster_fit_3 = false;
bool cluster_fit_3_black_only = false;
bool endpoint_refinement = false;
};
static Options setup_options(Quality level, bool enable_three_color_mode, bool enable_transparent_black) {
Options opt;
switch (level) {
case Quality_Level1: // Box fit + least squares fit.
opt.box_fit = true;
opt.least_squares_fit = true;
opt.threshold = 1.0f / 256;
break;
case Quality_Level2: // Cluster fit 4, threshold = 24.
opt.box_fit = true;
opt.least_squares_fit = true;
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 24;
break;
case Quality_Level3: // Cluster fit 4, threshold = 32.
opt.box_fit = true;
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 32;
break;
case Quality_Level4: // Cluster fit 4, threshold = 48.
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 48;
break;
case Quality_Level5: // Cluster fit 4, threshold = 64.
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 64;
break;
case Quality_Level6: // Cluster fit 4, threshold = 96.
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 96;
break;
case Quality_Level7: // Cluster fit 4, threshold = 128.
opt.cluster_fit = true;
opt.cluster_fit_3_black_only = enable_three_color_mode && enable_transparent_black;
opt.threshold = 1.0f / 128;
break;
case Quality_Level8: // Cluster fit 4+3, threshold = 256.
opt.cluster_fit = true;
opt.cluster_fit_3 = enable_three_color_mode;
opt.threshold = 1.0f / 256;
break;
case Quality_Level9: // Cluster fit 4+3, threshold = 256 + Refinement.
opt.cluster_fit = true;
opt.cluster_fit_3 = enable_three_color_mode;
opt.threshold = 1.0f / 256;
opt.endpoint_refinement = true;
break;
}
return opt;
}
static float compress_dxt1(Quality level, const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, bool three_color_black, BlockDXT1 * output)
{
Options opt = setup_options(level, three_color_mode, three_color_black);
Vector3 colors[16];
float weights[16];
bool any_black = false;
int count;
if (opt.cluster_fit) {
count = reduce_colors(input_colors, input_weights, 16, opt.threshold, colors, weights, &any_black);
}
else {
for (int i = 0; i < 16; i++) {
colors[i] = input_colors[i].xyz;
}
count = 16;
}
if (count == 0) {
// Output trivial block.
output->col0.u = 0;
output->col1.u = 0;
output->indices = 0;
return 0;
}
// Cluster fit cannot handle single color blocks, so encode them optimally.
if (count == 1) {
compress_dxt1_single_color_optimal(vector3_to_color32(colors[0]), output);
return evaluate_mse(input_colors, input_weights, color_weights, output);
}
float error = FLT_MAX;
// Quick end point selection.
if (opt.box_fit) {
Vector3 c0, c1;
fit_colors_bbox(colors, count, &c0, &c1);
inset_bbox(&c0, &c1);
select_diagonal(colors, count, &c0, &c1);
error = output_block4(input_colors, input_weights, color_weights, c0, c1, output);
// Refine color for the selected indices.
if (opt.least_squares_fit && optimize_end_points4(output->indices, input_colors, 16, &c0, &c1)) {
BlockDXT1 optimized_block;
float optimized_error = output_block4(input_colors, input_weights, color_weights, c0, c1, &optimized_block);
if (optimized_error < error) {
error = optimized_error;
*output = optimized_block;
}
}
}
if (opt.cluster_fit) {
// @@ Use current endpoints as input for initial PCA approximation?
bool use_three_color_black = any_black && three_color_black;
bool use_three_color_mode = opt.cluster_fit_3 || (use_three_color_black && opt.cluster_fit_3_black_only);
// Try cluster fit.
BlockDXT1 cluster_fit_output;
float cluster_fit_error = compress_dxt1_cluster_fit(input_colors, input_weights, colors, weights, count, color_weights, use_three_color_mode, use_three_color_black, three_color_black, &cluster_fit_output);
if (cluster_fit_error < error) {
*output = cluster_fit_output;
error = cluster_fit_error;
}
}
if (opt.endpoint_refinement) {
error = refine_endpoints(input_colors, input_weights, color_weights, three_color_mode, error, output);
}
return error;
}
// Public API
void init_dxt1(Decoder decoder) {
s_decoder = decoder;
init_single_color_tables(decoder);
init_cluster_tables();
}
void decode_dxt1(const void * block, unsigned char rgba_block[16 * 4], Decoder decoder/*=Decoder_D3D10*/) {
decode_dxt1((const BlockDXT1 *)block, rgba_block, decoder);
}
float evaluate_dxt1_error(const unsigned char rgba_block[16 * 4], const void * dxt_block, Decoder decoder/*=Decoder_D3D10*/) {
return evaluate_dxt1_error(rgba_block, (const BlockDXT1 *)dxt_block, decoder);
}
float compress_dxt1(Quality level, const float * input_colors, const float * input_weights, const float color_weights[3], bool three_color_mode, bool three_color_black, void * output) {
return compress_dxt1(level, (Vector4*)input_colors, input_weights, { color_weights[0], color_weights[1], color_weights[2] }, three_color_mode, three_color_black, (BlockDXT1*)output);
}
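// Example (illustrative sketch, not part of the library): compressing a single 4x4 block.
// Assumes 'pixels' holds 16 RGBA colors as floats in [0, 1], 4 floats per pixel.
//
//   icbc::init_dxt1(icbc::Decoder_D3D10);
//   float weights[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
//   float color_weights[3] = { 1.0f, 1.0f, 1.0f }; // uniform; use per-channel weights for perceptual error
//   unsigned char block[8];
//   float mse = icbc::compress_dxt1(icbc::Quality_Default, pixels, weights, color_weights,
//                                   /*three_color_mode=*/true, /*three_color_black=*/true, block);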
} // icbc
// // Do not pollute preprocessor definitions.
// #undef ICBC_SIMD
// #undef ICBC_ASSERT
// #undef ICBC_SCALAR
// #undef ICBC_SSE2
// #undef ICBC_SSE41
// #undef ICBC_AVX1
// #undef ICBC_AVX2
// #undef ICBC_AVX512
// #undef ICBC_NEON
// #undef ICBC_VMX
// #undef ICBC_USE_FMA
// #undef ICBC_USE_AVX2_PERMUTE2
// #undef ICBC_USE_AVX512_PERMUTE
// #undef ICBC_USE_NEON_VTL
// #undef ICBC_PERFECT_ROUND
#endif // ICBC_IMPLEMENTATION
// Version History:
// v1.00 - Initial release.
// v1.01 - Added SPMD code path with AVX support.
// v1.02 - Removed SIMD code path.
// v1.03 - Quality levels. AVX512, Neon, Altivec, vectorized reduction and index selection.
// v1.04 - Automatic compile-time SIMD selection. Specify hw decoder at runtime. More optimizations.
// v1.05 - Bug fixes. Small optimizations.
// Copyright (c) 2020 Ignacio Castano <castano@gmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.