diff --git a/extern/goofy_tc.h b/extern/goofy_tc.h new file mode 100644 index 0000000..43197af --- /dev/null +++ b/extern/goofy_tc.h @@ -0,0 +1,1565 @@ +// goofy_tc.h v1.0 +// Realtime BC1/ETC1 encoder by Sergey Makeev +// +// LICENSE: +// MIT license at the end of this file. + +namespace goofy +{ + int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); + int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); +} + +#include <stdint.h> // fixed-width integer types used by the constants, lane views and tables below + +// Enable SSE2 codec +#define GOOFY_SSE2 (1) // tested with #ifdef below, so any definition (even 0) selects the SSE2 path + +#define goofy_restrict __restrict +#define goofy_inline __forceinline + +#define goofy_align16(x) __declspec(align(16)) x + +#ifdef GOOFY_SSE2 +#include <emmintrin.h> // SSE2 +#else +#include <string.h> // memset/memcpy +#endif + +#ifdef GOOFYTC_IMPLEMENTATION +namespace goofy +{ + +// constants (one byte value replicated across all 16 bytes, 16-byte aligned) +goofy_align16(static const uint32_t gConstEight[4]) = { 0x08080808, 0x08080808, 0x08080808, 0x08080808 }; +goofy_align16(static const uint32_t gConstSixteen[4]) = { 0x10101010, 0x10101010, 0x10101010, 0x10101010 }; +goofy_align16(static const uint32_t gConstMaxInt[4]) = { 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f }; + +#ifdef GOOFY_SSE2 +typedef __m128i uint8x16_t; +#else + +struct uint8x16_t +{ + union // alternative views over the same 16 bytes + { + uint8_t data[16]; + + int8_t m128i_i8[16]; + uint8_t m128i_u8[16]; + + struct // per-byte view named as four r/g/b/a groups + { + uint8_t r0; + uint8_t g0; + uint8_t b0; + uint8_t a0; + + uint8_t r1; + uint8_t g1; + uint8_t b1; + uint8_t a1; + + uint8_t r2; + uint8_t g2; + uint8_t b2; + uint8_t a2; + + uint8_t r3; + uint8_t g3; + uint8_t b3; + uint8_t a3; + }; + + struct // eight 16-bit lanes + { + uint16_t s0; + uint16_t s1; + uint16_t s2; + uint16_t s3; + uint16_t s4; + uint16_t s5; + uint16_t s6; + uint16_t s7; + }; + + struct // four 32-bit lanes + { + uint32_t u0; + uint32_t u1; + uint32_t u2; + uint32_t u3; + }; + + struct // two 64-bit halves + { + uint64_t l0; + uint64_t l1; + }; + }; +}; + +#endif + + +// 2x16xU8 +struct uint8x16x2_t +{ + // rows + uint8x16_t r0; + 
uint8x16_t r1; +}; + +// 3x16xU8 +struct uint8x16x3_t +{ + // rows + uint8x16_t r0; + uint8x16_t r1; + uint8x16_t r2; +}; + +// 4x16xU8 +struct uint8x16x4_t +{ + // rows + uint8x16_t r0; + uint8x16_t r1; + uint8x16_t r2; + uint8x16_t r3; +}; + +// 2xU64 (the two 64-bit halves of one 128-bit vector, as filled by getAsUInt64x2) +struct uint64x2_t +{ + uint64_t r0; + uint64_t r1; +}; + + +namespace simd +{ +// SSE2 implementation +#ifdef GOOFY_SSE2 + + goofy_inline uint8x16_t zero() // all 128 bits cleared + { + return _mm_setzero_si128(); + } + + goofy_inline uint8x16_t fetch(const void* p) // aligned 128-bit load: p must be 16-byte aligned + { + return _mm_load_si128((const __m128i*)p); + } + + goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a) + { + uint64x2_t res; + res.r0 = _mm_cvtsi128_si64(a); // low 64 bits + res.r1 = _mm_cvtsi128_si64(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2))); // swap the 64-bit halves, then read the former high half + return res; + } + + goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b) // bitwise OR + { + return _mm_or_si128(a, b); + } + + goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b) // bitwise AND + { + return _mm_and_si128(a, b); + } + + goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b) // (~a) & b: the first operand is the one complemented + { + return _mm_andnot_si128(a, b); + } + + goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b) // bitwise blend: mask bits that are 1 take a, bits that are 0 take b + { + return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)); + } + + goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned minimum + { + return _mm_min_epu8(a, b); + } + + goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned maximum + { + return _mm_max_epu8(a, b); + } + + goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned average, rounded up + { + return _mm_avg_epu8(a, b); + } + + goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a) // broadcast 32-bit lane 0 to all four lanes + { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0)); + } + + goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a) // broadcast 32-bit lane 1 + { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)); + } + + goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a) // broadcast 32-bit lane 2 + { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 
2)); + } + + goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a) + { + return _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)); + } + + goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b) + { + return _mm_cmpeq_epi8(a, b); + } + + goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b) + { + return _mm_cmplt_epi8(a, b); + } + + goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b) + { + return _mm_adds_epu8(a, b); + } + + goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b) + { + return _mm_subs_epu8(a, b); + } + + goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v) + { + uint8x16_t tr0 = _mm_unpacklo_epi32(v.r0, v.r1); + uint8x16_t tr1 = _mm_unpacklo_epi32(v.r2, v.r3); + uint8x16_t tr2 = _mm_unpackhi_epi32(v.r0, v.r1); + uint8x16_t tr3 = _mm_unpackhi_epi32(v.r2, v.r3); + + uint8x16x4_t res; + res.r0 = _mm_unpacklo_epi64(tr0, tr1); + res.r1 = _mm_unpackhi_epi64(tr0, tr1); + res.r2 = _mm_unpacklo_epi64(tr2, tr3); + res.r3 = _mm_unpackhi_epi64(tr2, tr3); + return res; + } + + goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v) + { + uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1); + uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1); + uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3); + uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3); + uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b); + uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b); + uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d); + uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d); + uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b); + uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b); + uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d); + uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d); + + uint8x16x3_t res; + res.r0 = _mm_unpacklo_epi64(s2a, s2c); // red + res.r1 = _mm_unpackhi_epi64(s2a, s2c); // green + res.r2 = _mm_unpacklo_epi64(s2b, s2d); // blue + //res.r3 = _mm_unpackhi_epi64(s2b, s2d); // alpha + return res; + } + + // transpose as 
four single channel 4x4 blocks at once + // + // in: + // + // R0 = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop | + // R1 = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop | + // R2 = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop | + // R3 = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop | + // + // out (NOTE: the final 32-bit shuffle leaves the columns swapped, in the order c d a b): + // + // R0 = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn | + // R1 = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn | + // R2 = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn | + // R3 = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn | + // + // +---+---+---+---+ +---+---+---+---+ + // | A | B | C | D | | C | G | K | O | + // +---+---+---+---+ +---+---+---+---+ + // | E | F | G | H | | D | H | L | P | + // +---+---+---+---+ --> +---+---+---+---+ + // | I | J | K | L | | A | E | I | M | + // +---+---+---+---+ +---+---+---+---+ + // | M | N | O | P | | B | F | J | N | + // +---+---+---+---+ +---+---+---+---+ + // + goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v) + { + const uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1); + const uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1); + const uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3); + const uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3); + const uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b); + const uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b); + const uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d); + const uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d); + const uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b); + const uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b); + const uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d); + const uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d); // three byte-interleave rounds leave one transposed 4-byte column per 32-bit lane (s2a/s2b: blocks 0 and 1, s2c/s2d: blocks 2 and 3) + + const uint8x16_t s3a = _mm_unpacklo_epi32(s2a, s2b); + const uint8x16_t s3b = _mm_unpackhi_epi32(s2a, s2b); + const uint8x16_t s3c = _mm_unpacklo_epi32(s2c, s2d); + const uint8x16_t s3d = _mm_unpackhi_epi32(s2c, s2d); // 32-bit interleave pairs up the columns that belong to the same block + + const uint8x16_t s4a = _mm_unpacklo_epi64(s3a, s3b); + const uint8x16_t s4b = _mm_unpackhi_epi64(s3a, s3b); // s4a = block 0, s4b = block 1, lanes in the order a c b d + const uint8x16_t s4c = 
_mm_unpacklo_epi64(s3c, s3d); + const uint8x16_t s4d = _mm_unpackhi_epi64(s3c, s3d); + + uint8x16x4_t res; + res.r0 = _mm_shuffle_epi32(s4a, _MM_SHUFFLE(2, 0, 3, 1)); + res.r1 = _mm_shuffle_epi32(s4b, _MM_SHUFFLE(2, 0, 3, 1)); + res.r2 = _mm_shuffle_epi32(s4c, _MM_SHUFFLE(2, 0, 3, 1)); + res.r3 = _mm_shuffle_epi32(s4d, _MM_SHUFFLE(2, 0, 3, 1)); + return res; + } + + goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d) + { + const uint8x16x4_t res = { + _mm_unpacklo_epi32(a, b), + _mm_unpackhi_epi32(a, b), + _mm_unpacklo_epi32(c, d), + _mm_unpackhi_epi32(c, d), + }; + + return res; + } + + goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16x2_t res; + res.r0 = _mm_unpacklo_epi32(a, b); + res.r1 = _mm_unpackhi_epi32(a, b); + return res; + } + + goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v) + { + return (uint32_t)_mm_movemask_epi8(v); + } + + goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16x2_t res; + res.r0 = _mm_unpacklo_epi8(a, b); + res.r1 = _mm_unpackhi_epi8(a, b); + return res; + } + + goofy_inline uint8x16_t not(const uint8x16_t& v) + { + return _mm_xor_si128(v, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + } + +#else + // generic CPU implementation + namespace detail + { + goofy_inline uint8x16_t unpacklo16(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.s0 = a.s0; + res.s1 = b.s0; + res.s2 = a.s1; + res.s3 = b.s1; + res.s4 = a.s2; + res.s5 = b.s2; + res.s6 = a.s3; + res.s7 = b.s3; + return res; + } + + goofy_inline uint8x16_t unpackhi16(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.s0 = a.s4; + res.s1 = b.s4; + res.s2 = a.s5; + res.s3 = b.s5; + res.s4 = a.s6; + res.s5 = b.s6; + res.s6 = a.s7; + res.s7 = b.s7; + return res; + } + + goofy_inline uint8x16_t unpacklo8(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.data[0] = 
a.data[0]; + res.data[1] = b.data[0]; + res.data[2] = a.data[1]; + res.data[3] = b.data[1]; + res.data[4] = a.data[2]; + res.data[5] = b.data[2]; + res.data[6] = a.data[3]; + res.data[7] = b.data[3]; + res.data[8] = a.data[4]; + res.data[9] = b.data[4]; + res.data[10] = a.data[5]; + res.data[11] = b.data[5]; + res.data[12] = a.data[6]; + res.data[13] = b.data[6]; + res.data[14] = a.data[7]; + res.data[15] = b.data[7]; + return res; + } + + goofy_inline uint8x16_t unpackhi8(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.data[0] = a.data[8]; + res.data[1] = b.data[8]; + res.data[2] = a.data[9]; + res.data[3] = b.data[9]; + res.data[4] = a.data[10]; + res.data[5] = b.data[10]; + res.data[6] = a.data[11]; + res.data[7] = b.data[11]; + res.data[8] = a.data[12]; + res.data[9] = b.data[12]; + res.data[10] = a.data[13]; + res.data[11] = b.data[13]; + res.data[12] = a.data[14]; + res.data[13] = b.data[14]; + res.data[14] = a.data[15]; + res.data[15] = b.data[15]; + return res; + } + + goofy_inline uint8x16_t unpacklo64(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.l0 = a.l0; + res.l1 = b.l0; + return res; + } + + goofy_inline uint8x16_t unpackhi64(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.l0 = a.l1; + res.l1 = b.l1; + return res; + } + + goofy_inline uint8x16_t unpacklo32(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.u0 = a.u0; + res.u1 = b.u0; + res.u2 = a.u1; + res.u3 = b.u1; + return res; + } + + goofy_inline uint8x16_t unpackhi32(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + res.u0 = a.u2; + res.u1 = b.u2; + res.u2 = a.u3; + res.u3 = b.u3; + return res; + } + + goofy_inline uint8x16_t replicateU0011(const uint8x16_t& a) + { + uint8x16_t res; + res.u0 = a.u0; + res.u1 = a.u0; + res.u2 = a.u1; + res.u3 = a.u1; + return res; + } + + goofy_inline uint8x16_t replicateU2233(const uint8x16_t& a) + { + uint8x16_t res; + res.u0 = a.u2; + res.u1 = a.u2; + res.u2 
= a.u3; + res.u3 = a.u3; + return res; + } + + goofy_inline uint8x16_t swizzleU1302(const uint8x16_t& a) + { + uint8x16_t res; + res.u0 = a.u1; + res.u1 = a.u3; + res.u2 = a.u0; + res.u3 = a.u2; + return res; + } + + } // namespace detail + + goofy_inline uint8x16_t zero() // all 16 bytes cleared + { + uint8x16_t r; + memset(&r, 0, sizeof(uint8x16_t)); + return r; + } + + goofy_inline uint8x16_t fetch(const void* p) // copies 16 bytes with memcpy, so p needs no particular alignment here + { + uint8x16_t r; + memcpy(&r, p, sizeof(uint8x16_t)); + return r; + } + + goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a) // the two 64-bit halves, low half first + { + uint64x2_t res; + res.r0 = a.l0; + res.r1 = a.l1; + return res; + } + + // bit or + goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = a.data[i] | b.data[i]; + } + return res; + } + + // bit and + goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = a.data[i] & b.data[i]; + } + return res; + } + + goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b) // (~a) & b: the first operand is the one complemented + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = (~a.data[i]) & b.data[i]; + } + return res; + } + + goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v) // packs the top bit of each byte into a 16-bit mask + { + uint32_t res = 0; + for (uint32_t i = 0; i < 16; i++) + { + uint32_t msb = ((v.data[i] & 0x80) >> 7); + res = res | (msb << i); // bit i of the result comes from byte i + } + return res; + } + + // bitwise select, decided independently for every bit of every byte: + // if (mask bit is 1) { + // return the bit of a; + // } + // else { + // return the bit of b; + // } + goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + unsigned char msk = mask.data[i]; + res.data[i] = (msk & a.data[i]) | ((~msk) & b.data[i]); // _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)) + } + return res; + } + + goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned minimum + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + 
res.data[i] = (a.data[i] < b.data[i]) ? a.data[i] : b.data[i]; + } + return res; + } + + goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned maximum + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = (a.data[i] > b.data[i]) ? a.data[i] : b.data[i]; + } + return res; + } + + goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned average, rounded up + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + uint32_t t = (a.data[i] + b.data[i]) + 1; // the extra 1 makes the shift below round up + res.data[i] = (uint8_t)(t >> 1); + } + return res; + } + + goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a) // broadcast 32-bit lane 0 to all four lanes + { + uint8x16_t res; + res.u0 = a.u0; + res.u1 = a.u0; + res.u2 = a.u0; + res.u3 = a.u0; + return res; + } + + goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a) // broadcast 32-bit lane 1 + { + uint8x16_t res; + res.u0 = a.u1; + res.u1 = a.u1; + res.u2 = a.u1; + res.u3 = a.u1; + return res; + } + + goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a) // broadcast 32-bit lane 2 + { + uint8x16_t res; + res.u0 = a.u2; + res.u1 = a.u2; + res.u2 = a.u2; + res.u3 = a.u2; + return res; + } + + goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a) // broadcast 32-bit lane 3 + { + uint8x16_t res; + res.u0 = a.u3; + res.u1 = a.u3; + res.u2 = a.u3; + res.u3 = a.u3; + return res; + } + + // cmp equal (signed); each result byte is 0xFF where the bytes match, 0x00 otherwise + goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = ((int8_t)a.data[i] == (int8_t)b.data[i]) ? 0xFF : 0x00; + } + return res; + } + + // cmp less (signed); int8_t casts keep the compare signed even where plain char is unsigned + goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + res.data[i] = ((int8_t)a.data[i] < (int8_t)b.data[i]) ? 
0xFF : 0x00; + } + return res; + } + + // add unsigned saturate + goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + int32_t diff = ((unsigned char)a.data[i] + (unsigned char)b.data[i]); + if (diff > 255) + diff = 255; + res.data[i] = (uint8_t)diff; + } + return res; + } + + // sub unsigned saturate + goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16_t res; + for (uint32_t i = 0; i < 16; i++) + { + int32_t diff = ((unsigned char)a.data[i] - (unsigned char)b.data[i]); + if (diff < 0) + diff = 0; + res.data[i] = (uint8_t)diff; + } + return res; + } + + // transpose as one 4x4 RGBA block + // + // in: + // + // R0 = | a0.rgba | a1.rgba | a2.rgba | a3.rgba | + // R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | + // R2 = | c0.rgba | c1.rgba | c2.rgba | c3.rgba | + // R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | + // + // out: + // + // R0 = | a0.rgba | b0.rgba | c0.rgba | d0.rgba | + // R1 = | a1.rgba | b1.rgba | c1.rgba | d1.rgba | + // R2 = | a2.rgba | b2.rgba | c2.rgba | d2.rgba | + // R3 = | a3.rgba | b3.rgba | c3.rgba | d3.rgba | + // + goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v) + { + // a0, b0, a1, b1 + const uint8x16_t tr0 = detail::unpacklo32(v.r0, v.r1); + // c0, d0, c1, d1 + const uint8x16_t tr1 = detail::unpacklo32(v.r2, v.r3); + // a2, b2, a3, b3 + const uint8x16_t tr2 = detail::unpackhi32(v.r0, v.r1); + // c2, d2, c3, d3 + const uint8x16_t tr3 = detail::unpackhi32(v.r2, v.r3); + + uint8x16x4_t res; + // a0, b0, c0, d0 + res.r0 = detail::unpacklo64(tr0, tr1); + // a1, b1, c1, d1 + res.r1 = detail::unpackhi64(tr0, tr1); + // a2, b2, c2, d2 + res.r2 = detail::unpacklo64(tr2, tr3); + // a3, b3, c3, d3 + res.r3 = detail::unpackhi64(tr2, tr3); + return res; + } + + // deinterleave as 4x16 + // + // in: + // + // R0 = | a0.rgba | a1.rgba | a2.rgba | a3.rgba | + // R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | + // R2 
= | c0.rgba | c1.rgba | c2.rgba | c3.rgba | + // R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | + // + // out: + // + // R0 = | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | + // R1 = | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | + // R2 = | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | + // R3 = | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | + // + goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v) + { + // step 1 + + // | a0.r | b0.r | a0.g | b0.g | a0.b | b0.b | a0.a | b0.a | a1.r | b1.r | a1.g | b1.g | a1.b | b1.b | a1.a | b1.a | + // | a2.r | b2.r | a2.g | b2.g | a2.b | b2.b | a2.a | b2.a | a3.r | b3.r | a3.g | b3.g | a3.b | b3.b | a3.a | b3.a | + const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1); + const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1); + + // | c0.r | d0.r | c0.g | d0.g | c0.b | d0.b | c0.a | d0.a | c1.r | d1.r | c1.g | d1.g | c1.b | d1.b | c1.a | d1.a | + // | c2.r | d2.r | c2.g | d2.g | c2.b | d2.b | c2.a | d2.a | c3.r | d3.r | c3.g | d3.g | c3.b | d3.b | c3.a | d3.a | + const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3); + const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3); + + // step 2 + // | a0.r | a2.r | b0.r | b2.r | a0.g | a2.g | b0.g | b2.g | a0.b | a2.b | b0.b | b2.b | a0.a | a2.a | b0.a | b2.a | + // | a1.r | a3.r | b1.r | b3.r | a1.g | a3.g | b1.g | b3.g | a1.b | a3.b | b1.b | b3.b | a1.a | a3.a | b1.a | b3.a | + const uint8x16_t s1a = detail::unpacklo8(s0a, s0b); + const uint8x16_t s1b = detail::unpackhi8(s0a, s0b); + + // | c0.r | c2.r | d0.r | d2.r | c0.g | c2.g | d0.g | d2.g | c0.b | c2.b | d0.b | d2.b | c0.a | c2.a | d0.a | d2.a | + // | c1.r | c3.r | d1.r | d3.r | c1.g | c3.g | d1.g | d3.g | c1.b | c3.b | d1.b | d3.b | c1.a | c3.a | d1.a | 
d3.a | + const uint8x16_t s1c = detail::unpacklo8(s0c, s0d); + const uint8x16_t s1d = detail::unpackhi8(s0c, s0d); + + // step 3 + // | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | + // | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | + const uint8x16_t s2a = detail::unpacklo8(s1a, s1b); + const uint8x16_t s2b = detail::unpackhi8(s1a, s1b); + + // | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | + // | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | + const uint8x16_t s2c = detail::unpacklo8(s1c, s1d); + const uint8x16_t s2d = detail::unpackhi8(s1c, s1d); + + // step 4 (final) + uint8x16x3_t res; + // | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | + // | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | + res.r0 = detail::unpacklo64(s2a, s2c); + res.r1 = detail::unpackhi64(s2a, s2c); + + // | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | + // | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | + res.r2 = detail::unpacklo64(s2b, s2d); + //res.r3 = detail::unpackhi64(s2b, s2d); + return res; + } + + // transpose as four single channel 4x4 blocks at once + // + // in: + // + // R0 = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop | + // R1 = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop | + // R2 = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop | + // R3 = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop | + // + // out: + // NOTE: columns are swapped! 
+ // + // 2 3 0 1 (zero-based index of the source column held by each 32-bit lane) + // R0 = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn | + // R1 = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn | + // R2 = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn | + // R3 = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn | + // + // +---+---+---+---+ +---+---+---+---+ + // | A | B | C | D | | C | G | K | O | + // +---+---+---+---+ +---+---+---+---+ + // | E | F | G | H | | D | H | L | P | + // +---+---+---+---+ --> +---+---+---+---+ + // | I | J | K | L | | A | E | I | M | + // +---+---+---+---+ +---+---+---+---+ + // | M | N | O | P | | B | F | J | N | + // +---+---+---+---+ +---+---+---+---+ + // + goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v) + { + // step 1: byte-interleave rows 0/1 and rows 2/3 + + // | 0.a | 1.a | 0.b | 1.b | 0.c | 1.c | 0.d | 1.d | 0.e | 1.e | 0.f | 1.f | 0.g | 1.g | 0.h | 1.h | + // | 0.i | 1.i | 0.j | 1.j | 0.k | 1.k | 0.l | 1.l | 0.m | 1.m | 0.n | 1.n | 0.o | 1.o | 0.p | 1.p | + const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1); + const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1); + + // | 2.a | 3.a | 2.b | 3.b | 2.c | 3.c | 2.d | 3.d | 2.e | 3.e | 2.f | 3.f | 2.g | 3.g | 2.h | 3.h | + // | 2.i | 3.i | 2.j | 3.j | 2.k | 3.k | 2.l | 3.l | 2.m | 3.m | 2.n | 3.n | 2.o | 3.o | 2.p | 3.p | + const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3); + const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3); + + // step 2: byte-interleave the low and high results of step 1 + + // | 0.a | 0.i | 1.a | 1.i | 0.b | 0.j | 1.b | 1.j | 0.c | 0.k | 1.c | 1.k | 0.d | 0.l | 1.d | 1.l | + // | 0.e | 0.m | 1.e | 1.m | 0.f | 0.n | 1.f | 1.n | 0.g | 0.o | 1.g | 1.o | 0.h | 0.p | 1.h | 1.p | + const uint8x16_t s1a = detail::unpacklo8(s0a, s0b); + const uint8x16_t s1b = detail::unpackhi8(s0a, s0b); + + // | 2.a | 2.i | 3.a | 3.i | 2.b | 2.j | 3.b | 3.j | 2.c | 2.k | 3.c | 3.k | 2.d | 2.l | 3.d | 3.l | + // | 2.e | 2.m | 3.e | 3.m | 2.f | 2.n | 3.f | 3.n | 2.g | 2.o | 3.g | 3.o | 2.h | 2.p | 3.h | 3.p | + const uint8x16_t s1c = detail::unpacklo8(s0c, s0d); + const uint8x16_t s1d = detail::unpackhi8(s0c, s0d); + + 
// step 3 + + // | 0.a | 0.e | 0.i | 0.m | 1.a | 1.e | 1.i | 1.m | 0.b | 0.f | 0.j | 0.n | 1.b | 1.f | 1.j | 1.n | + // | 0.c | 0.g | 0.k | 0.o | 1.c | 1.g | 1.k | 1.o | 0.d | 0.h | 0.l | 0.p | 1.d | 1.h | 1.l | 1.p | + const uint8x16_t s2a = detail::unpacklo8(s1a, s1b); + const uint8x16_t s2b = detail::unpackhi8(s1a, s1b); + + // | 2.a | 2.e | 2.i | 2.m | 3.a | 3.e | 3.i | 3.m | 2.b | 2.f | 2.j | 2.n | 3.b | 3.f | 3.j | 3.n | + // | 2.c | 2.g | 2.k | 2.o | 3.c | 3.g | 3.k | 3.o | 2.d | 2.h | 2.l | 2.p | 3.d | 3.h | 3.l | 3.p | + const uint8x16_t s2c = detail::unpacklo8(s1c, s1d); + const uint8x16_t s2d = detail::unpackhi8(s1c, s1d); + + // step 4 + + // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | + // | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p | + const uint8x16_t s3a = detail::unpacklo32(s2a, s2b); + const uint8x16_t s3b = detail::unpackhi32(s2a, s2b); + + // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | + // | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p | + const uint8x16_t s3c = detail::unpacklo32(s2c, s2d); + const uint8x16_t s3d = detail::unpackhi32(s2c, s2d); + + // step 5 + + // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | + // | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p | + const uint8x16_t s4a = detail::unpacklo64(s3a, s3b); + const uint8x16_t s4b = detail::unpackhi64(s3a, s3b); + + // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | + // | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p | + const uint8x16_t s4c = detail::unpacklo64(s3c, s3d); + const uint8x16_t s4d = detail::unpackhi64(s3c, s3d); + + // step 5 (final) + uint8x16x4_t res; + // | 0.c | 
0.g | 0.k | 0.o | 0.d | 0.h | 0.l | 0.p | 0.a | 0.e | 0.i | 0.m | 0.b | 0.f | 0.j | 0.n | + res.r0 = detail::swizzleU1302(s4a); + // | 1.c | 1.g | 1.k | 1.o | 1.d | 1.h | 1.l | 1.p | 1.a | 1.e | 1.i | 1.m | 1.b | 1.f | 1.j | 1.n | + res.r1 = detail::swizzleU1302(s4b); + // | 2.c | 2.g | 2.k | 2.o | 2.d | 2.h | 2.l | 2.p | 2.a | 2.e | 2.i | 2.m | 2.b | 2.f | 2.j | 2.n | + res.r2 = detail::swizzleU1302(s4c); + // | 3.c | 3.g | 3.k | 3.o | 3.d | 3.h | 3.l | 3.p | 3.a | 3.e | 3.i | 3.m | 3.b | 3.f | 3.j | 3.n | + res.r3 = detail::swizzleU1302(s4d); + return res; + } + + // like ZipU4 but for two parallel zips + // + // in: + // + // A = | a0.rgba | a1.rgba | a2.rgba | a3.rgba | + // B = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | + // C = | c0.rgba | c1.rgba | c2.rgba | c3.rgba | + // D = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | + // + // out: + // + // R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba | + // R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba | + // R2 = | c0.rgba | d0.rgba | c1.rgba | d1.rgba | + // R3 = | c2.rgba | d2.rgba | c3.rgba | d3.rgba | + // + goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d) + { + const uint8x16x4_t res = { + detail::unpacklo32(a, b), + detail::unpackhi32(a, b), + detail::unpacklo32(c, d), + detail::unpackhi32(c, d), + }; + return res; + } + + // + // in: + // + // a = | a0.rgba | a1.rgba | a2.rgba| a3.rgba + // b = | b0.rgba | b1.rgba | b2.rgba| b3.rgba + // + // out: + // + // R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba | + // R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba | + // + goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16x2_t res; + res.r0 = detail::unpacklo32(a, b); + res.r1 = detail::unpackhi32(a, b); + return res; + } + + // + // in: + // + // a = | a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | aA | aB | aC | aD | aE | aF | + // b = | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 | b8 | b9 | bA | bB | bC | bD | bE | bF | + 
// + // out: + // + // R0 = | a0 | b0 | a1 | b1 | a2 | b2 | a3 | b3 | a4 | b4 | a5 | b5 | a6 | b6 | a7 | b7 | + // R1 = | a8 | b8 | a9 | b9 | aA | bA | aB | bB | aC | bC | aD | bD | aE | bE | aF | bF | + // + goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b) + { + uint8x16x2_t res; + res.r0 = detail::unpacklo8(a, b); + res.r1 = detail::unpackhi8(a, b); + return res; + } + + goofy_inline uint8x16_t not(const uint8x16_t& v) + { + uint8x16_t res; + for (int i = 0; i < 16; i++) + { + res.data[i] = v.data[i] ^ 0xFF; + } + return res; + } + +#endif +} + +static_assert(sizeof(uint8x16_t) == 16, "Incorrect byte8x16 sizeof"); +static_assert(sizeof(uint8x16x2_t) == 32, "Incorrect byte8x16x1 sizeof"); +static_assert(sizeof(uint8x16x3_t) == 48, "Incorrect byte8x16x2 sizeof"); +static_assert(sizeof(uint8x16x4_t) == 64, "Incorrect byte8x16x4 sizeof"); +static_assert(sizeof(uint64x2_t) == 16, "Incorrect uint64x2_t sizeof"); + + +// Block brightness variance to ETC control byte +static const uint32_t etc1BrighnessRangeTocontrolByte[256] = { + 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, + 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, + 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, + 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, + 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x6F000000, 0x6F000000, 0x6F000000, 
0x6F000000, 0x6F000000, 0x6F000000, + 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, + 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, + 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, + 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, + 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, + 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, + 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, + 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, + 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, + 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, + 0xDB000000, 
0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xFF000000, 0xFF000000 +}; + + +enum GoofyCodecType +{ + GOOFY_DXT1, + GOOFY_ETC1, +}; + +// +// Encode 4 DXT1/ETC1 at once +// +template +goofy_inline void goofySimdEncode(const unsigned char* goofy_restrict inputRGBA, size_t inputStride, unsigned char* goofy_restrict pResult) +{ + // Fetch 16x4 pixels from the buffer(four DX blocks) + // 16 pixels wide is better for the CPU cache utilization (64 bytes per line) and it is better for SIMD lane utilization + // ----------------------------------------------------------- + uint8x16x4_t bl0; + uint8x16x4_t bl1; + uint8x16x4_t bl2; + uint8x16x4_t bl3; + bl0.r0 = simd::fetch(inputRGBA); + bl1.r0 = simd::fetch(inputRGBA + 16); + bl2.r0 = simd::fetch(inputRGBA + 32); + bl3.r0 = simd::fetch(inputRGBA + 48); + inputRGBA += inputStride; + bl0.r1 = simd::fetch(inputRGBA); + bl1.r1 = simd::fetch(inputRGBA + 16); + bl2.r1 = simd::fetch(inputRGBA + 32); + bl3.r1 = simd::fetch(inputRGBA + 48); + inputRGBA += inputStride; + bl0.r2 = simd::fetch(inputRGBA); + bl1.r2 = simd::fetch(inputRGBA + 16); + bl2.r2 = simd::fetch(inputRGBA + 32); + bl3.r2 = simd::fetch(inputRGBA + 48); + inputRGBA += inputStride; + bl0.r3 = simd::fetch(inputRGBA); + bl1.r3 = simd::fetch(inputRGBA + 16); + bl2.r3 = simd::fetch(inputRGBA + 32); + bl3.r3 = simd::fetch(inputRGBA + 48); + + // Find min block colors + // ----------------------------------------------------------- + const uint8x16x4_t blMin = { + simd::minu(simd::minu(bl0.r0, bl0.r1), simd::minu(bl0.r2, bl0.r3)), // min0_clmn0.rgba | min0_clmn1.rgba | min0_clmn2.rgba | min0_clmn3.rgba + simd::minu(simd::minu(bl1.r0, bl1.r1), simd::minu(bl1.r2, bl1.r3)), // min1_clmn0.rgba | min1_clmn1.rgba | min1_clmn2.rgba | min1_clmn3.rgba + simd::minu(simd::minu(bl2.r0, bl2.r1), simd::minu(bl2.r2, bl2.r3)), // min2_clmn0.rgba | min2_clmn1.rgba | 
min2_clmn2.rgba | min2_clmn3.rgba + simd::minu(simd::minu(bl3.r0, bl3.r1), simd::minu(bl3.r2, bl3.r3)) // min3_clmn0.rgba | min3_clmn1.rgba | min3_clmn2.rgba | min3_clmn3.rgba + }; + + // blMinTr (transposed blMin) + // min0_clmn0.rgba | min1_clmn0.rgba | min2_clmn0.rgba | min3_clmn0.rgba + // min0_clmn1.rgba | min1_clmn1.rgba | min2_clmn1.rgba | min3_clmn1.rgba + // min0_clmn2.rgba | min1_clmn2.rgba | min2_clmn2.rgba | min3_clmn2.rgba + // min0_clmn3.rgba | min1_clmn3.rgba | min2_clmn3.rgba | min3_clmn3.rgba + const uint8x16x4_t blMinTr = simd::transposeAs4x4(blMin); + + // Per-block min colors + // min0.rgba | min1.rgba | min2.rgba | min3.rgba + const uint8x16_t minColors = simd::minu( + simd::minu(blMinTr.r0, blMinTr.r1), + simd::minu(blMinTr.r2, blMinTr.r3) + ); + + // Same to find max block colors + // ----------------------------------------------------------- + const uint8x16x4_t blMax = { + simd::maxu(simd::maxu(bl0.r0, bl0.r1), simd::maxu(bl0.r2, bl0.r3)), + simd::maxu(simd::maxu(bl1.r0, bl1.r1), simd::maxu(bl1.r2, bl1.r3)), + simd::maxu(simd::maxu(bl2.r0, bl2.r1), simd::maxu(bl2.r2, bl2.r3)), + simd::maxu(simd::maxu(bl3.r0, bl3.r1), simd::maxu(bl3.r2, bl3.r3)) + }; + + const uint8x16x4_t blMaxTr = simd::transposeAs4x4(blMax); + + // Per-block max colors + // max0.rgba | max1.rgba | max2.rgba | max3.rgba + const uint8x16_t maxColors = simd::maxu( + simd::maxu(blMaxTr.r0, blMaxTr.r1), + simd::maxu(blMaxTr.r2, blMaxTr.r3) + ); + + // Find min/max brigtness + // ----------------------------------------------------------- + + // Note: some SSE lanes wasted, it is not ideal, but seems OK-ish? 
+ // + // min0.rgba | min0.rgba | min1.rgba | min1.rgba + // min2.rgba | min2.rgba | min3.rgba | min3.rgba + // max0.rgba | max0.rgba | max1.rgba | max1.rgba + // max2.rgba | max2.rgba | max3.rgba | max3.rgba + const uint8x16x4_t blMinMax = simd::zipU4x2(minColors, minColors, maxColors, maxColors); + + // Deinterleave + // min0.rr | min1.rr | min2.rr | min3.rr | max0.rr | max1.rr | max2.rr | max3.rr + // min0.gg | min1.gg | min2.gg | min3.gg | max0.gg | max1.gg | max2.gg | max3.gg + // min0.bb | min1.bb | min2.bb | min3.bb | max0.bb | max1.bb | max2.bb | max3.bb + const uint8x16x3_t blMinMaxDi = simd::deinterleaveRGB(blMinMax); + + // Get Y component of YCoCg color-model (perceptual brightness) + // https://en.wikipedia.org/wiki/YCoCg + // Y = 0.25 * R + 0.25 * B + 0.5 * G + // We can rewrite equation above using the following form + // Y = (((R + B) / 2) + G) / 2 + // + // Y = min0.yy | min1.yy | min2.yy | min3.yy | max0.yy | max1.yy | max2.yy | max3.yy + const uint8x16_t Y = simd::avg(simd::avg(blMinMaxDi.r0, blMinMaxDi.r2), blMinMaxDi.r1); + + // Min/max brightness per block + // R0 = min0.yyyy | min1.yyyy | min2.yyyy | min3.yyyy + // R1 = max0.yyyy | max1.yyyy | max2.yyyy | max3.yyyy + const uint8x16x2_t blMinMaxY = simd::zipB16(Y, Y); + + // Clamp to min brightness + const uint8x16_t constEight = simd::fetch(&gConstEight); + // range0.yyyy | range1.yyyy | range2.yyyy | range3.yyyy + const uint8x16_t blRangeY = simd::maxu(simd::subsatu(blMinMaxY.r1, blMinMaxY.r0), constEight); + + // mid0.yyyy | mid1.yyyy | mid2.yyyy | mid3.yyyy + const uint8x16_t blMidY = simd::avg(blMinMaxY.r0, blMinMaxY.r1); + + // Approximate multiplication by 0.375 to get quantization thresholds + const uint8x16_t constZero = simd::zero(); + + const uint8x16_t blHalfRangeY = simd::avg(blRangeY, constZero); + const uint8x16_t blQuarterRangeY = simd::avg(blHalfRangeY, constZero); + const uint8x16_t blEighthsRangeY = simd::avg(blQuarterRangeY, constZero); + + // Threshold = (quarter + eights) 
= (0.25 + 0.125) ~= (range * 0.375) + // qt0.yyyy | qt1.yyyy | qt2.yyyy | qt3.yyyy + const uint8x16_t blQThreshold = simd::addsatu(blQuarterRangeY, blEighthsRangeY); + + // Quantization (generate indices) + // ----------------------------------------------------------- + const uint8x16_t constMaxInt = simd::fetch(&gConstMaxInt); + + // block 0 + // + // p0.r p1.r p2.r p3.r p4.r p5.r p6.r p7.r p8.r p9.r p10.r p11.r p12.r p13.r p14.r p15.r + // p0.g p1.g p2.g p3.g p4.g p5.g p6.g p7.g p8.g p9.g p10.g p11.g p12.g p13.g p14.g p15.g + // p0.b p1.b p2.b p3.b p4.b p5.b p6.b p7.b p8.b p9.b p10.b p11.b p12.b p13.b p14.b p15.b + const uint8x16x3_t bl0Di = simd::deinterleaveRGB(bl0); + + // Convert RGB to brightness + // per-pixel block brightness + const uint8x16_t bl0Y = simd::avg(simd::avg(bl0Di.r0, bl0Di.r2), bl0Di.r1); + + // Block brightness to compare with + const uint8x16_t bl0MidY = simd::replicateU0000(blMidY); + + // Brightness difference (per-pixel in block) + // NOTE: we need to clamp difference to max signed int8, because of the signed comparison later + const uint8x16_t bl0PosDiffY = simd::minu(simd::subsatu(bl0Y, bl0MidY), constMaxInt); + const uint8x16_t bl0NegDiffY = simd::minu(simd::subsatu(bl0MidY, bl0Y), constMaxInt); + // Greater or Equal to zero mask + const uint8x16_t bl0GezMask = simd::cmpeqi(bl0NegDiffY, constZero); + + // Absolute diffference of brightness (per-pixel in block) + const uint8x16_t bl0AbsDiffY = simd::or(bl0PosDiffY, bl0NegDiffY); + + // get quantization threshold for current block + const uint8x16_t bl0QThreshold = simd::replicateU0000(blQThreshold); + + // Less than Quantization Threshold mask + const uint8x16_t bl0LqtMask = simd::cmplti(bl0AbsDiffY, bl0QThreshold); + + // Here we've got two bitmasks + // + // GezMask = greater or equal than zero (per pixel) + // LqtMask = less than quantization threshold (per pixel) + // + // + // min qt qt max + // x---------x-----+-----x---------x + // 0 + // + // |---------------| greater or equal 
than zero (GezMask) + // + // |-----------| less than quantization threshold (LqtMask) + // + + // block 1 + const uint8x16x3_t bl1Di = simd::deinterleaveRGB(bl1); + const uint8x16_t bl1Y = simd::avg(simd::avg(bl1Di.r0, bl1Di.r2), bl1Di.r1); + const uint8x16_t bl1MidY = simd::replicateU1111(blMidY); + const uint8x16_t bl1PosDiffY = simd::minu(simd::subsatu(bl1Y, bl1MidY), constMaxInt); + const uint8x16_t bl1NegDiffY = simd::minu(simd::subsatu(bl1MidY, bl1Y), constMaxInt); + const uint8x16_t bl1GezMask = simd::cmpeqi(bl1NegDiffY, constZero); + const uint8x16_t bl1AbsDiffY = simd::or(bl1PosDiffY, bl1NegDiffY); + const uint8x16_t bl1QThreshold = simd::replicateU1111(blQThreshold); + const uint8x16_t bl1LqtMask = simd::cmplti(bl1AbsDiffY, bl1QThreshold); + + // block 2 + const uint8x16x3_t bl2Di = simd::deinterleaveRGB(bl2); + const uint8x16_t bl2Y = simd::avg(simd::avg(bl2Di.r0, bl2Di.r2), bl2Di.r1); + const uint8x16_t bl2MidY = simd::replicateU2222(blMidY); + const uint8x16_t bl2PosDiffY = simd::minu(simd::subsatu(bl2Y, bl2MidY), constMaxInt); + const uint8x16_t bl2NegDiffY = simd::minu(simd::subsatu(bl2MidY, bl2Y), constMaxInt); + const uint8x16_t bl2GezMask = simd::cmpeqi(bl2NegDiffY, constZero); + const uint8x16_t bl2AbsDiffY = simd::or(bl2PosDiffY, bl2NegDiffY); + const uint8x16_t bl2QThreshold = simd::replicateU2222(blQThreshold); + const uint8x16_t bl2LqtMask = simd::cmplti(bl2AbsDiffY, bl2QThreshold); + + // block 3 + const uint8x16x3_t bl3Di = simd::deinterleaveRGB(bl3); + const uint8x16_t bl3Y = simd::avg(simd::avg(bl3Di.r0, bl3Di.r2), bl3Di.r1); + const uint8x16_t bl3MidY = simd::replicateU3333(blMidY); + const uint8x16_t bl3PosDiffY = simd::minu(simd::subsatu(bl3Y, bl3MidY), constMaxInt); + const uint8x16_t bl3NegDiffY = simd::minu(simd::subsatu(bl3MidY, bl3Y), constMaxInt); + const uint8x16_t bl3GezMask = simd::cmpeqi(bl3NegDiffY, constZero); + const uint8x16_t bl3AbsDiffY = simd::or(bl3PosDiffY, bl3NegDiffY); + const uint8x16_t bl3QThreshold = 
simd::replicateU3333(blQThreshold); + const uint8x16_t bl3LqtMask = simd::cmplti(bl3AbsDiffY, bl3QThreshold); + + // Finalize blocks + // ----------------------------------------------------------- + if (CODEC_TYPE == GOOFY_DXT1) + { + // Generate DXT indices using given masks + + // DXT indices order + // ------------------------- + // C0(max) C2 C3 C1(min) + // DEC: | 0 | 2 | 3 | 1 | + // BIN: | 00b | 10b | 11b | 01b | + // + // | GezMask | + // | LqtMask | + + // Zip two masks to match DX bits order + // Gez0 | Lqt0 | Gez1 | Lqt1 | Gez2 | Lqt2 | Gez3 | Lqt3 | Gez4 | Lqt4 | Gez5 | Lqt5 | Gez6 | Lqt6 | Gez7 | Lqt7 + // Gez8 | Lqt8 | Gez9 | Lqt9 | GezA | LqtA | GezB | LqtB | GezC | LqtC | GezD | LqtD | GezE | LqtE | GezF | LqtF + const uint8x16x2_t bl0RawIndices = simd::zipB16(simd::not(bl0GezMask), bl0LqtMask); + const uint8x16x2_t bl3RawIndices = simd::zipB16(simd::not(bl3GezMask), bl3LqtMask); + const uint8x16x2_t bl2RawIndices = simd::zipB16(simd::not(bl2GezMask), bl2LqtMask); + const uint8x16x2_t bl1RawIndices = simd::zipB16(simd::not(bl1GezMask), bl1LqtMask); + + // Bytes to bits + uint32_t bl0Indices = simd::moveMaskMSB(bl0RawIndices.r0) | (simd::moveMaskMSB(bl0RawIndices.r1) << 16); + uint32_t bl1Indices = simd::moveMaskMSB(bl1RawIndices.r0) | (simd::moveMaskMSB(bl1RawIndices.r1) << 16); + uint32_t bl2Indices = simd::moveMaskMSB(bl2RawIndices.r0) | (simd::moveMaskMSB(bl2RawIndices.r1) << 16); + uint32_t bl3Indices = simd::moveMaskMSB(bl3RawIndices.r0) | (simd::moveMaskMSB(bl3RawIndices.r1) << 16); + + // Convert rgb888 to rgb555 + + // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead + // We need to sub eight before, because avg is (a+b+1) >> 1 + + // max555_0.rgba | max555_1.rgba | max555_2.rgba | max555_3.rgba + const uint8x16_t maxColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(maxColors, constEight), constZero), constZero), constZero); + // min555_0.rgba | min555_1.rgba | min555_2.rgba | min555_3.rgba + const 
uint8x16_t minColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(minColors, constEight), constZero), constZero), constZero); + + // max555_0.rgba | min555_0.rgba | max555_1.rgba | min555_1.rgba + // max555_2.rgba | min555_2.rgba | max555_3.rgba | min555_3.rgba + const uint8x16x2_t maxMinColors555 = simd::zipU4(maxColors555, minColors555); + + const uint64x2_t maxMin01 = simd::getAsUInt64x2(maxMinColors555.r0); + const uint64x2_t maxMin23 = simd::getAsUInt64x2(maxMinColors555.r1); + + // R0 + // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 11 = 0000000000000000 1111100000000000b + // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b >> 2 = 0000000000000000 0000011111000000b + // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b >> 16 = 0000000000000000 0000000000011111b + + // R1 + // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 5 = 1111100000000000 0000000000000000b + // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 18 = 0000011111000000 0000000000000000b + // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 32 = 0000000000011111 0000000000000000b + + // 0x20 = 0000000000000000 0000000000100000b + + uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult; + + uint32_t block0a = (uint32_t)(0x20 | // max color green channel LSB (to avoid switching to DXT1 3-color mode) + (maxMin01.r0 & 0x1Full) << 11ull | (maxMin01.r0 & 0x1F00ull) >> 2ull | (maxMin01.r0 & 0x1F0000ull) >> 16ull | // max color + (maxMin01.r0 & 0x1F00000000ull) >> 5ull | (maxMin01.r0 & 0x1F0000000000ull) >> 18ull | (maxMin01.r0 & 0x1F000000000000ull) >> 32ull); // min color + //uint32_t block0b = bl0Indices << 32ull; // indices + *pDest = block0a; pDest++; *pDest = bl0Indices; + + uint32_t block1a = (uint32_t)(0x20 | + (maxMin01.r1 & 0x1Full) << 11ull | (maxMin01.r1 & 0x1F00ull) >> 2ull | (maxMin01.r1 & 0x1F0000ull) >> 16ull | + (maxMin01.r1 & 0x1F00000000ull) >> 
5ull | (maxMin01.r1 & 0x1F0000000000ull) >> 18ull | (maxMin01.r1 & 0x1F000000000000ull) >> 32ull); + //uint32_t block1b = bl1Indices << 32ull; + pDest++; *pDest = block1a; pDest++; *pDest = bl1Indices; + + uint32_t block2a = (uint32_t)(0x20 | + (maxMin23.r0 & 0x1Full) << 11ull | (maxMin23.r0 & 0x1F00ull) >> 2ull | (maxMin23.r0 & 0x1F0000ull) >> 16ull | + (maxMin23.r0 & 0x1F00000000ull) >> 5ull | (maxMin23.r0 & 0x1F0000000000ull) >> 18ull | (maxMin23.r0 & 0x1F000000000000ull) >> 32ull); + //bl2Indices << 32ull; + pDest++; *pDest = block2a; pDest++; *pDest = bl2Indices; + + uint32_t block3a = (uint32_t)(0x20 | + (maxMin23.r1 & 0x1Full) << 11ull | (maxMin23.r1 & 0x1F00ull) >> 2ull | (maxMin23.r1 & 0x1F0000ull) >> 16ull | + (maxMin23.r1 & 0x1F00000000ull) >> 5ull | (maxMin23.r1 & 0x1F0000000000ull) >> 18ull | (maxMin23.r1 & 0x1F000000000000ull) >> 32ull); + //bl3Indices << 32ull; + pDest++; *pDest = block3a; pDest++; *pDest = bl3Indices; + } + else if (CODEC_TYPE == GOOFY_ETC1) + { + // Combined masks (major bit = GreaterEqualZero other 7 bits = LessQuantizationThreshold) + const uint8x16x4_t blMasks = { + simd::or(simd::andnot(constMaxInt, bl0GezMask), simd::and(bl0LqtMask, constMaxInt)), + simd::or(simd::andnot(constMaxInt, bl1GezMask), simd::and(bl1LqtMask, constMaxInt)), + simd::or(simd::andnot(constMaxInt, bl2GezMask), simd::and(bl2LqtMask, constMaxInt)), + simd::or(simd::andnot(constMaxInt, bl3GezMask), simd::and(bl3LqtMask, constMaxInt)) + }; + + // +---+---+---+---+ +---+---+---+---+ + // | A | B | C | D | | C | G | K | O | + // +---+---+---+---+ +---+---+---+---+ + // | E | F | G | H | | D | H | L | P | + // +---+---+---+---+ --> +---+---+---+---+ + // | I | J | K | L | | A | E | I | M | + // +---+---+---+---+ +---+---+---+---+ + // | M | N | O | P | | B | F | J | N | + // +---+---+---+---+ +---+---+---+---+ + const uint8x16x4_t blMasksTr = simd::transposeAs4x4x4(blMasks); + + // Unpack masks and copy from bytes to bits + const uint32_t bl0PosOrZero = 
simd::moveMaskMSB(blMasksTr.r0); + const uint32_t bl1PosOrZero = simd::moveMaskMSB(blMasksTr.r1); + const uint32_t bl2PosOrZero = simd::moveMaskMSB(blMasksTr.r2); + const uint32_t bl3PosOrZero = simd::moveMaskMSB(blMasksTr.r3); + + uint8x16_t bl0LessThanQtMask = simd::and(blMasksTr.r0, constMaxInt); + uint8x16_t bl1LessThanQtMask = simd::and(blMasksTr.r1, constMaxInt); + uint8x16_t bl2LessThanQtMask = simd::and(blMasksTr.r2, constMaxInt); + uint8x16_t bl3LessThanQtMask = simd::and(blMasksTr.r3, constMaxInt); + bl0LessThanQtMask = simd::addsatu(bl0LessThanQtMask, bl0LessThanQtMask); + bl1LessThanQtMask = simd::addsatu(bl1LessThanQtMask, bl1LessThanQtMask); + bl2LessThanQtMask = simd::addsatu(bl2LessThanQtMask, bl2LessThanQtMask); + bl3LessThanQtMask = simd::addsatu(bl3LessThanQtMask, bl3LessThanQtMask); + + const uint32_t bl0LessThanQt = simd::moveMaskMSB(bl0LessThanQtMask); + const uint32_t bl1LessThanQt = simd::moveMaskMSB(bl1LessThanQtMask); + const uint32_t bl2LessThanQt = simd::moveMaskMSB(bl2LessThanQtMask); + const uint32_t bl3LessThanQt = simd::moveMaskMSB(bl3LessThanQtMask); + +#if 1 + // Keep chromatic component from the average color, but override brightness + // NOTE: This is slightly slower but gets slightly better quality + + // Find average blocks color + const uint8x16x4_t blAvg = { + simd::avg(simd::avg(bl0.r0, bl0.r1), simd::avg(bl0.r2, bl0.r3)), + simd::avg(simd::avg(bl1.r0, bl1.r1), simd::avg(bl1.r2, bl1.r3)), + simd::avg(simd::avg(bl2.r0, bl2.r1), simd::avg(bl2.r2, bl2.r3)), + simd::avg(simd::avg(bl3.r0, bl3.r1), simd::avg(bl3.r2, bl3.r3)) + }; + + const uint8x16x4_t blAvgTr = simd::transposeAs4x4(blAvg); + + const uint8x16_t blAvgColors = simd::avg( + simd::avg(blAvgTr.r0, blAvgTr.r1), + simd::avg(blAvgTr.r2, blAvgTr.r3) + ); + + // Note: a lot of SSE lanes wasted, it is not ideal, TODO? 
+ // + // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba + // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba + // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba + // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba + const uint8x16x4_t blAvg4 = simd::zipU4x2(blAvgColors, blAvgColors, blAvgColors, blAvgColors); + + // Deinterleave + // avg0.rr | avg1.rr | avg2.rr | avg3.rr | avg0.rr | avg1.rr | avg2.rr | avg3.rr + // avg0.gg | avg1.gg | avg2.gg | avg3.gg | avg0.gg | avg1.gg | avg2.gg | avg3.gg + // avg0.bb | avg1.bb | avg2.bb | avg3.bb | avg0.bb | avg1.bb | avg2.bb | avg3.bb + const uint8x16x3_t blAvg4Di = simd::deinterleaveRGB(blAvg4); + + // Y = avg0.yy | avg1.yy | avg2.yy | avg3.yy | avg0.yy | avg1.yy | avg2.yy | avg3.yy + const uint8x16_t Y = simd::avg(simd::avg(blAvg4Di.r0, blAvg4Di.r2), blAvg4Di.r1); + + // Min/max brightness per block + // R0 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy + // R1 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy // NOTE: not used! + const uint8x16x2_t blAvgY = simd::zipB16(Y, Y); + + const uint8x16_t blPosCorrectionY = simd::minu(simd::subsatu(blMidY, blAvgY.r0), constMaxInt); + const uint8x16_t blNegCorrectionY = simd::minu(simd::subsatu(blAvgY.r0, blMidY), constMaxInt); + const uint8x16_t blCorrectionYGezMask = simd::cmpeqi(blNegCorrectionY, constZero); + const uint8x16_t blCorrectionYAbs = simd::or(blPosCorrectionY, blNegCorrectionY); + + // Get the color in the middle between min/max colors of the block. + // NOTE: this is not the same as an average block color. + + const uint8x16_t blBaseColorsPos = simd::addsatu(blAvgColors, blCorrectionYAbs); + const uint8x16_t blBaseColorsNeg = simd::subsatu(blAvgColors, blCorrectionYAbs); + + const uint8x16_t blBaseColors = simd::select(blCorrectionYGezMask, blBaseColorsPos, blBaseColorsNeg); +#else + // Get the color in the middle between min/max colors of the block. + // NOTE: this is not the same as an average block color. 
+ const uint8x16_t blBaseColors = simd::avg(minColors, maxColors); +#endif + + // Convert rgb888 to rgb555 + + // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead + // We need to sub eight before, because avg is (a+b+1) >> 1 + + // mid555_0.rgba | mid555_1.rgba | mid555_2.rgba | mid555_3.rgba + const uint8x16_t baseColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(blBaseColors, constEight), constZero), constZero), constZero); + + const uint64x2_t baseColors = simd::getAsUInt64x2(baseColors555); + + // R0 + // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 3 = 00000000 00000000 11111000b + // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b << 3 = 00000000 11111000 00000000b + // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b << 3 = 11111000 00000000 00000000b + + // R1 + // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 29 = 00000000 00000000 11111000b + // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 29 = 00000000 11111000 00000000b + // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 29 = 11111000 00000000 00000000b + + uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult; + + const uint32_t block0a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[0]] | ((baseColors.r0 << 3ull) & 0xFFFFFF); + const uint32_t block0b = ~(bl0PosOrZero | (bl0LessThanQt << 16)); + *pDest = block0a; pDest++; *pDest = block0b; + + const uint32_t block1a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[4]] | ((baseColors.r0 >> 29ull) & 0xFFFFFF); + const uint32_t block1b = ~(bl1PosOrZero | (bl1LessThanQt << 16)); + pDest++; *pDest = block1a; pDest++; *pDest = block1b; + + const uint32_t block2a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[8]] | ((baseColors.r1 << 3ull) & 0xFFFFFF); + const uint32_t block2b = ~(bl2PosOrZero | (bl2LessThanQt << 16)); + pDest++; *pDest = block2a; pDest++; *pDest = 
block2b; + + const uint32_t block3a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[12]] | ((baseColors.r1 >> 29ull) & 0xFFFFFF); + const uint32_t block3b = ~(bl3PosOrZero | (bl3LessThanQt << 16)); + pDest++; *pDest = block3a; pDest++; *pDest = block3b; + } +} + + +int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride) +{ + // those checks are required because of 4x1 block window inside the compressor + if (width % 16 != 0) + { + return -1; + } + + if (height % 4 != 0) + { + return -2; + } + + unsigned int blockW = width >> 2; + unsigned int blockH = height >> 2; + + size_t inputStride = stride; + for (uint32_t y = 0; y < blockH; y++) + { + const unsigned char* goofy_restrict encoderPos = input; + for (uint32_t x = 0; x < blockW; x += 4) + { + goofySimdEncode(encoderPos, inputStride, result); + encoderPos += 64; // 16 rgba pixels (4 DXT blocks) = 16 * 4 = 64 + result += 32; // 4 DXT1 blocks = 8 * 4 = 32 + } + input += inputStride * 4; // 4 lines + } + return 0; +} + +int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride) +{ + // those checks are required because of 4x1 block window inside the compressor + if (width % 16 != 0) + { + return -1; + } + + if (height % 4 != 0) + { + return -2; + } + + unsigned int blockW = width >> 2; + unsigned int blockH = height >> 2; + + size_t inputStride = stride; + for (uint32_t y = 0; y < blockH; y++) + { + const unsigned char* goofy_restrict encoderPos = input; + for (uint32_t x = 0; x < blockW; x += 4) + { + goofySimdEncode(encoderPos, inputStride, result); + encoderPos += 64; // 16 rgba pixels (4 DXT blocks) = 16 * 4 = 64 + result += 32; // 4 DXT1 blocks = 8 * 4 = 32 + } + input += inputStride * 4; // 4 lines + } + return 0; +} + + + +#undef goofy_restrict +#undef goofy_inline +#undef goofy_align16 +} +#endif + + + + + +// Copyright (c) 2020 Sergey Makeev +// +// Permission 
is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.