// goofy_tc.h v1.0 // Realtime BC1/ETC1 encoder by Sergey Makeev // // LICENSE: // MIT license at the end of this file. namespace goofy { int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); } #include // Enable SSE2 codec #define GOOFY_SSE2 (1) #define goofy_restrict __restrict #define goofy_inline __forceinline #define goofy_align16(x) __declspec(align(16)) x #ifdef GOOFY_SSE2 #include // SSE2 #else #include // memset/memcpy #endif #ifdef GOOFYTC_IMPLEMENTATION namespace goofy { // constants goofy_align16(static const uint32_t gConstEight[4]) = { 0x08080808, 0x08080808, 0x08080808, 0x08080808 }; goofy_align16(static const uint32_t gConstSixteen[4]) = { 0x10101010, 0x10101010, 0x10101010, 0x10101010 }; goofy_align16(static const uint32_t gConstMaxInt[4]) = { 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f }; #ifdef GOOFY_SSE2 typedef __m128i uint8x16_t; #else struct uint8x16_t { union { uint8_t data[16]; int8_t m128i_i8[16]; uint8_t m128i_u8[16]; struct { uint8_t r0; uint8_t g0; uint8_t b0; uint8_t a0; uint8_t r1; uint8_t g1; uint8_t b1; uint8_t a1; uint8_t r2; uint8_t g2; uint8_t b2; uint8_t a2; uint8_t r3; uint8_t g3; uint8_t b3; uint8_t a3; }; struct { uint16_t s0; uint16_t s1; uint16_t s2; uint16_t s3; uint16_t s4; uint16_t s5; uint16_t s6; uint16_t s7; }; struct { uint32_t u0; uint32_t u1; uint32_t u2; uint32_t u3; }; struct { uint64_t l0; uint64_t l1; }; }; }; #endif // 2x16xU8 struct uint8x16x2_t { // rows uint8x16_t r0; uint8x16_t r1; }; // 3x16xU8 struct uint8x16x3_t { // rows uint8x16_t r0; uint8x16_t r1; uint8x16_t r2; }; // 4x16xU8 struct uint8x16x4_t { // rows uint8x16_t r0; uint8x16_t r1; uint8x16_t r2; uint8x16_t r3; }; // 2xU64 struct uint64x2_t { uint64_t r0; uint64_t r1; }; namespace simd { // SSE2 implementation #ifdef GOOFY_SSE2 
// NOTE(review): 'or', 'and' and 'not' used below are standard C++
// alternative operator tokens; using them as identifiers relies on MSVC's
// default non-conforming mode (no /permissive-). Names kept because other
// code in this file calls simd::or / simd::and / simd::not.

// All-zero vector.
goofy_inline uint8x16_t zero() { return _mm_setzero_si128(); }

// 16-byte aligned load.
goofy_inline uint8x16_t fetch(const void* p) { return _mm_load_si128((const __m128i*)p); }

// Extract the two 64-bit halves.
goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a)
{
    uint64x2_t res;
    res.r0 = _mm_cvtsi128_si64(a);
    res.r1 = _mm_cvtsi128_si64(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
    return res;
}

// Bitwise ops.
goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b) { return _mm_or_si128(a, b); }
goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b) { return _mm_and_si128(a, b); }
goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b) { return _mm_andnot_si128(a, b); }

// Per-bit select: (mask & a) | (~mask & b).
goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b)
{
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}

// Per-byte unsigned min/max and rounded average.
goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b) { return _mm_min_epu8(a, b); }
goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b) { return _mm_max_epu8(a, b); }
goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b) { return _mm_avg_epu8(a, b); }

// Broadcast one 32-bit lane to all four lanes.
goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0)); }
goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1)); }
goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2)); }
goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a) { return _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3)); }

// Per-byte compares (signed); result bytes are 0xFF/0x00.
goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b) { return _mm_cmpeq_epi8(a, b); }
goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b) { return _mm_cmplt_epi8(a, b); }

// Per-byte unsigned saturating add/sub.
goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b) { return _mm_adds_epu8(a, b); }
goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b) { return _mm_subs_epu8(a, b); }

// Transpose one 4x4 block of 32-bit elements (RGBA pixels).
goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v)
{
    uint8x16_t tr0 = _mm_unpacklo_epi32(v.r0, v.r1);
    uint8x16_t tr1 = _mm_unpacklo_epi32(v.r2, v.r3);
    uint8x16_t tr2 = _mm_unpackhi_epi32(v.r0, v.r1);
    uint8x16_t tr3 = _mm_unpackhi_epi32(v.r2, v.r3);
    uint8x16x4_t res;
    res.r0 = _mm_unpacklo_epi64(tr0, tr1);
    res.r1 = _mm_unpackhi_epi64(tr0, tr1);
    res.r2 = _mm_unpacklo_epi64(tr2, tr3);
    res.r3 = _mm_unpackhi_epi64(tr2, tr3);
    return res;
}

// Deinterleave 16 RGBA pixels into per-channel registers:
// r0 = 16 reds, r1 = 16 greens, r2 = 16 blues (alpha is dropped).
goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v)
{
    uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1);
    uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
    uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3);
    uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
    uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);
    uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
    uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
    uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
    uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);
    uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
    uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
    uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
    uint8x16x3_t res;
    res.r0 = _mm_unpacklo_epi64(s2a, s2c); // red
    res.r1 = _mm_unpackhi_epi64(s2a, s2c); // green
    res.r2 = _mm_unpacklo_epi64(s2b, s2d); // blue
    //res.r3 = _mm_unpackhi_epi64(s2b, s2d); // alpha
    return res;
}

// Transpose four single-channel 4x4 blocks at once (one block per 16-byte
// input register, one byte per element).
// NOTE: the output columns are rotated — each result register holds the
// transposed columns in the order (c g k o)(d h l p)(a e i m)(b f j n),
// matching the generic implementation below.
//
// +---+---+---+---+      +---+---+---+---+
// | A | B | C | D |      | C | G | K | O |
// +---+---+---+---+      +---+---+---+---+
// | E | F | G | H |      | D | H | L | P |
// +---+---+---+---+  --> +---+---+---+---+
// | I | J | K | L |      | A | E | I | M |
// +---+---+---+---+      +---+---+---+---+
// | M | N | O | P |      | B | F | J | N |
// +---+---+---+---+      +---+---+---+---+
//
goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v)
{
    const uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1);
    const uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
    const uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3);
    const uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
    const uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);
    const uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
    const uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
    const uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
    const uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);
    const uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
    const uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
    const uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
    const uint8x16_t s3a = _mm_unpacklo_epi32(s2a, s2b);
    const uint8x16_t s3b = _mm_unpackhi_epi32(s2a, s2b);
    const uint8x16_t s3c = _mm_unpacklo_epi32(s2c, s2d);
    const uint8x16_t s3d = _mm_unpackhi_epi32(s2c, s2d);
    const uint8x16_t s4a = _mm_unpacklo_epi64(s3a, s3b);
    const uint8x16_t s4b = _mm_unpackhi_epi64(s3a, s3b);
    const uint8x16_t s4c = _mm_unpacklo_epi64(s3c, s3d);
    const uint8x16_t s4d = _mm_unpackhi_epi64(s3c, s3d);
    uint8x16x4_t res;
    res.r0 = _mm_shuffle_epi32(s4a, _MM_SHUFFLE(2, 0, 3, 1));
    res.r1 = _mm_shuffle_epi32(s4b, _MM_SHUFFLE(2, 0, 3, 1));
    res.r2 = _mm_shuffle_epi32(s4c, _MM_SHUFFLE(2, 0, 3, 1));
    res.r3 = _mm_shuffle_epi32(s4d, _MM_SHUFFLE(2, 0, 3, 1));
    return res;
}

// Two parallel 32-bit interleaves: {lo(a,b), hi(a,b), lo(c,d), hi(c,d)}.
goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d)
{
    const uint8x16x4_t res = {
        _mm_unpacklo_epi32(a, b),
        _mm_unpackhi_epi32(a, b),
        _mm_unpacklo_epi32(c, d),
        _mm_unpackhi_epi32(c, d),
    };
    return res;
}

// Interleave 32-bit lanes of a and b.
goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b)
{
    uint8x16x2_t res;
    res.r0 = _mm_unpacklo_epi32(a, b);
res.r1 = _mm_unpackhi_epi32(a, b); return res; } goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v) { return (uint32_t)_mm_movemask_epi8(v); } goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b) { uint8x16x2_t res; res.r0 = _mm_unpacklo_epi8(a, b); res.r1 = _mm_unpackhi_epi8(a, b); return res; } goofy_inline uint8x16_t not(const uint8x16_t& v) { return _mm_xor_si128(v, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); } #else // generic CPU implementation namespace detail { goofy_inline uint8x16_t unpacklo16(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.s0 = a.s0; res.s1 = b.s0; res.s2 = a.s1; res.s3 = b.s1; res.s4 = a.s2; res.s5 = b.s2; res.s6 = a.s3; res.s7 = b.s3; return res; } goofy_inline uint8x16_t unpackhi16(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.s0 = a.s4; res.s1 = b.s4; res.s2 = a.s5; res.s3 = b.s5; res.s4 = a.s6; res.s5 = b.s6; res.s6 = a.s7; res.s7 = b.s7; return res; } goofy_inline uint8x16_t unpacklo8(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.data[0] = a.data[0]; res.data[1] = b.data[0]; res.data[2] = a.data[1]; res.data[3] = b.data[1]; res.data[4] = a.data[2]; res.data[5] = b.data[2]; res.data[6] = a.data[3]; res.data[7] = b.data[3]; res.data[8] = a.data[4]; res.data[9] = b.data[4]; res.data[10] = a.data[5]; res.data[11] = b.data[5]; res.data[12] = a.data[6]; res.data[13] = b.data[6]; res.data[14] = a.data[7]; res.data[15] = b.data[7]; return res; } goofy_inline uint8x16_t unpackhi8(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.data[0] = a.data[8]; res.data[1] = b.data[8]; res.data[2] = a.data[9]; res.data[3] = b.data[9]; res.data[4] = a.data[10]; res.data[5] = b.data[10]; res.data[6] = a.data[11]; res.data[7] = b.data[11]; res.data[8] = a.data[12]; res.data[9] = b.data[12]; res.data[10] = a.data[13]; res.data[11] = b.data[13]; res.data[12] = a.data[14]; res.data[13] = b.data[14]; res.data[14] = a.data[15]; res.data[15] = b.data[15]; 
return res; } goofy_inline uint8x16_t unpacklo64(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.l0 = a.l0; res.l1 = b.l0; return res; } goofy_inline uint8x16_t unpackhi64(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.l0 = a.l1; res.l1 = b.l1; return res; } goofy_inline uint8x16_t unpacklo32(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.u0 = a.u0; res.u1 = b.u0; res.u2 = a.u1; res.u3 = b.u1; return res; } goofy_inline uint8x16_t unpackhi32(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; res.u0 = a.u2; res.u1 = b.u2; res.u2 = a.u3; res.u3 = b.u3; return res; } goofy_inline uint8x16_t replicateU0011(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u0; res.u1 = a.u0; res.u2 = a.u1; res.u3 = a.u1; return res; } goofy_inline uint8x16_t replicateU2233(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u2; res.u1 = a.u2; res.u2 = a.u3; res.u3 = a.u3; return res; } goofy_inline uint8x16_t swizzleU1302(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u1; res.u1 = a.u3; res.u2 = a.u0; res.u3 = a.u2; return res; } } //detail goofy_inline uint8x16_t zero() { uint8x16_t r; memset(&r, 0, sizeof(uint8x16_t)); return r; } goofy_inline uint8x16_t fetch(const void* p) { uint8x16_t r; memcpy(&r, p, sizeof(uint8x16_t)); return r; } goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a) { uint64x2_t res; res.r0 = a.l0; res.r1 = a.l1; return res; } // bit or goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = a.data[i] | b.data[i]; } return res; } // bit and goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = a.data[i] & b.data[i]; } return res; } goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = (~a.data[i]) & b.data[i]; } return res; } goofy_inline uint32_t 
moveMaskMSB(const uint8x16_t& v) { uint32_t res = 0; for (uint32_t i = 0; i < 16; i++) { uint32_t msb = ((v.data[i] & 0x80) >> 7); res = res | (msb << i); } return res; } // // if (maskA) { // return a; // } // else { // return b; // } goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { unsigned char msk = mask.data[i]; res.data[i] = (msk & a.data[i]) | ((~msk) & b.data[i]); // _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)) } return res; } goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = (a.data[i] < b.data[i]) ? a.data[i] : b.data[i]; } return res; } goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = (a.data[i] > b.data[i]) ? a.data[i] : b.data[i]; } return res; } goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { uint32_t t = (a.data[i] + b.data[i]) + 1; res.data[i] = (uint8_t)(t >> 1); } return res; } goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u0; res.u1 = a.u0; res.u2 = a.u0; res.u3 = a.u0; return res; } goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u1; res.u1 = a.u1; res.u2 = a.u1; res.u3 = a.u1; return res; } goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u2; res.u1 = a.u2; res.u2 = a.u2; res.u3 = a.u2; return res; } goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a) { uint8x16_t res; res.u0 = a.u3; res.u1 = a.u3; res.u2 = a.u3; res.u3 = a.u3; return res; } // cmp equal (signed) goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = ((char)a.data[i] == (char)b.data[i]) ? 
0xFF : 0x00; } return res; } // cmp less (signed) goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { res.data[i] = ((char)a.data[i] < (char)b.data[i]) ? 0xFF : 0x00; } return res; } // add unsigned saturate goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { int32_t diff = ((unsigned char)a.data[i] + (unsigned char)b.data[i]); if (diff > 255) diff = 255; res.data[i] = (uint8_t)diff; } return res; } // sub unsigned saturate goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b) { uint8x16_t res; for (uint32_t i = 0; i < 16; i++) { int32_t diff = ((unsigned char)a.data[i] - (unsigned char)b.data[i]); if (diff < 0) diff = 0; res.data[i] = (uint8_t)diff; } return res; } // transpose as one 4x4 RGBA block // // in: // // R0 = | a0.rgba | a1.rgba | a2.rgba | a3.rgba | // R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | // R2 = | c0.rgba | c1.rgba | c2.rgba | c3.rgba | // R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | // // out: // // R0 = | a0.rgba | b0.rgba | c0.rgba | d0.rgba | // R1 = | a1.rgba | b1.rgba | c1.rgba | d1.rgba | // R2 = | a2.rgba | b2.rgba | c2.rgba | d2.rgba | // R3 = | a3.rgba | b3.rgba | c3.rgba | d3.rgba | // goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v) { // a0, b0, a1, b1 const uint8x16_t tr0 = detail::unpacklo32(v.r0, v.r1); // c0, d0, c1, d1 const uint8x16_t tr1 = detail::unpacklo32(v.r2, v.r3); // a2, b2, a3, b3 const uint8x16_t tr2 = detail::unpackhi32(v.r0, v.r1); // c2, d2, c3, d3 const uint8x16_t tr3 = detail::unpackhi32(v.r2, v.r3); uint8x16x4_t res; // a0, b0, c0, d0 res.r0 = detail::unpacklo64(tr0, tr1); // a1, b1, c1, d1 res.r1 = detail::unpackhi64(tr0, tr1); // a2, b2, c2, d2 res.r2 = detail::unpacklo64(tr2, tr3); // a3, b3, c3, d3 res.r3 = detail::unpackhi64(tr2, tr3); return res; } // deinterleave as 4x16 // // in: // // R0 = | a0.rgba | a1.rgba | 
a2.rgba | a3.rgba | // R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | // R2 = | c0.rgba | c1.rgba | c2.rgba | c3.rgba | // R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | // // out: // // R0 = | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | // R1 = | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | // R2 = | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | // R3 = | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | // goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v) { // step 1 // | a0.r | b0.r | a0.g | b0.g | a0.b | b0.b | a0.a | b0.a | a1.r | b1.r | a1.g | b1.g | a1.b | b1.b | a1.a | b1.a | // | a2.r | b2.r | a2.g | b2.g | a2.b | b2.b | a2.a | b2.a | a3.r | b3.r | a3.g | b3.g | a3.b | b3.b | a3.a | b3.a | const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1); const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1); // | c0.r | d0.r | c0.g | d0.g | c0.b | d0.b | c0.a | d0.a | c1.r | d1.r | c1.g | d1.g | c1.b | d1.b | c1.a | d1.a | // | c2.r | d2.r | c2.g | d2.g | c2.b | d2.b | c2.a | d2.a | c3.r | d3.r | c3.g | d3.g | c3.b | d3.b | c3.a | d3.a | const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3); const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3); // step 2 // | a0.r | a2.r | b0.r | b2.r | a0.g | a2.g | b0.g | b2.g | a0.b | a2.b | b0.b | b2.b | a0.a | a2.a | b0.a | b2.a | // | a1.r | a3.r | b1.r | b3.r | a1.g | a3.g | b1.g | b3.g | a1.b | a3.b | b1.b | b3.b | a1.a | a3.a | b1.a | b3.a | const uint8x16_t s1a = detail::unpacklo8(s0a, s0b); const uint8x16_t s1b = detail::unpackhi8(s0a, s0b); // | c0.r | c2.r | d0.r | d2.r | c0.g | c2.g | d0.g | d2.g | c0.b | c2.b | d0.b | d2.b | c0.a | c2.a | d0.a | d2.a | // | c1.r | c3.r | d1.r | d3.r | c1.g | c3.g | d1.g | d3.g | c1.b | c3.b | d1.b | d3.b | c1.a | 
c3.a | d1.a | d3.a | const uint8x16_t s1c = detail::unpacklo8(s0c, s0d); const uint8x16_t s1d = detail::unpackhi8(s0c, s0d); // step 3 // | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | // | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | const uint8x16_t s2a = detail::unpacklo8(s1a, s1b); const uint8x16_t s2b = detail::unpackhi8(s1a, s1b); // | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | // | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | const uint8x16_t s2c = detail::unpacklo8(s1c, s1d); const uint8x16_t s2d = detail::unpackhi8(s1c, s1d); // step 4 (final) uint8x16x3_t res; // | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | // | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g | res.r0 = detail::unpacklo64(s2a, s2c); res.r1 = detail::unpackhi64(s2a, s2c); // | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | // | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a | res.r2 = detail::unpacklo64(s2b, s2d); //res.r3 = detail::unpackhi64(s2b, s2d); return res; } // transpose as four single channel 4x4 blocks at once // // in: // // R0 = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop | // R1 = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop | // R2 = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop | // R3 = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop | // // out: // NOTE: columns are swapped! 
// // 3 4 0 1 // R1 = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn | // R2 = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn | // R3 = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn | // R0 = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn | // // +---+---+---+---+ +---+---+---+---+ // | A | B | C | D | | C | G | K | O | // +---+---+---+---+ +---+---+---+---+ // | E | F | G | H | | D | H | L | P | // +---+---+---+---+ --> +---+---+---+---+ // | I | J | K | L | | A | E | I | M | // +---+---+---+---+ +---+---+---+---+ // | M | N | O | P | | B | F | J | N | // +---+---+---+---+ +---+---+---+---+ // goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v) { // step 1 // | 0.a | 1.a | 0.b | 1.b | 0.c | 1.c | 0.d | 1.d | 0.e | 1.e | 0.f | 1.f | 0.g | 1.g | 0.h | 1.h | // | 0.i | 1.i | 0.j | 1.j | 0.k | 1.k | 0.l | 1.l | 0.m | 1.m | 0.n | 1.n | 0.o | 1.o | 0.p | 1.p | const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1); const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1); // | 2.a | 3.a | 2.b | 3.b | 2.c | 3.c | 2.d | 3.d | 2.e | 3.e | 2.f | 3.f | 2.g | 3.g | 2.h | 3.h | // | 2.i | 3.i | 2.j | 3.j | 2.k | 3.k | 2.l | 3.l | 2.m | 3.m | 2.n | 3.n | 2.o | 3.o | 2.p | 3.p | const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3); const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3); // step 2 // | 0.a | 0.i | 1.a | 1.i | 0.b | 0.j | 1.b | 1.j | 0.c | 0.k | 1.c | 1.k | 0.d | 0.l | 1.d | 1.l | // | 0.e | 0.m | 1.e | 1.m | 0.f | 0.n | 1.f | 1.n | 0.g | 0.o | 1.g | 1.o | 0.h | 0.p | 1.h | 1.p | const uint8x16_t s1a = detail::unpacklo8(s0a, s0b); const uint8x16_t s1b = detail::unpackhi8(s0a, s0b); // | 2.a | 2.i | 3.a | 3.i | 2.b | 2.j | 3.b | 3.j | 2.c | 2.k | 3.c | 3.k | 2.d | 2.l | 3.d | 3.l | // | 2.e | 2.m | 3.e | 3.m | 2.f | 2.n | 3.f | 3.n | 2.g | 2.o | 3.g | 3.o | 2.h | 2.p | 3.h | 3.p | const uint8x16_t s1c = detail::unpacklo8(s0c, s0d); const uint8x16_t s1d = detail::unpackhi8(s0c, s0d); // step 3 // | 0.a | 0.e | 0.i | 0.m | 1.a | 1.e | 1.i | 1.m | 0.b | 0.f | 0.j | 0.n | 
1.b | 1.f | 1.j | 1.n | // | 0.c | 0.g | 0.k | 0.o | 1.c | 1.g | 1.k | 1.o | 0.d | 0.h | 0.l | 0.p | 1.d | 1.h | 1.l | 1.p | const uint8x16_t s2a = detail::unpacklo8(s1a, s1b); const uint8x16_t s2b = detail::unpackhi8(s1a, s1b); // | 2.a | 2.e | 2.i | 2.m | 3.a | 3.e | 3.i | 3.m | 2.b | 2.f | 2.j | 2.n | 3.b | 3.f | 3.j | 3.n | // | 2.c | 2.g | 2.k | 2.o | 3.c | 3.g | 3.k | 3.o | 2.d | 2.h | 2.l | 2.p | 3.d | 3.h | 3.l | 3.p | const uint8x16_t s2c = detail::unpacklo8(s1c, s1d); const uint8x16_t s2d = detail::unpackhi8(s1c, s1d); // step 4 // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | // | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p | const uint8x16_t s3a = detail::unpacklo32(s2a, s2b); const uint8x16_t s3b = detail::unpackhi32(s2a, s2b); // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | // | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p | const uint8x16_t s3c = detail::unpacklo32(s2c, s2d); const uint8x16_t s3d = detail::unpackhi32(s2c, s2d); // step 5 // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | // | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p | const uint8x16_t s4a = detail::unpacklo64(s3a, s3b); const uint8x16_t s4b = detail::unpackhi64(s3a, s3b); // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | // | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p | const uint8x16_t s4c = detail::unpacklo64(s3c, s3d); const uint8x16_t s4d = detail::unpackhi64(s3c, s3d); // step 5 (final) uint8x16x4_t res; // | 0.c | 0.g | 0.k | 0.o | 0.d | 0.h | 0.l | 0.p | 0.a | 0.e | 0.i | 0.m | 0.b | 0.f | 0.j | 0.n | res.r0 = detail::swizzleU1302(s4a); // | 1.c | 1.g | 1.k | 1.o | 1.d | 1.h 
| 1.l | 1.p | 1.a | 1.e | 1.i | 1.m | 1.b | 1.f | 1.j | 1.n | res.r1 = detail::swizzleU1302(s4b); // | 2.c | 2.g | 2.k | 2.o | 2.d | 2.h | 2.l | 2.p | 2.a | 2.e | 2.i | 2.m | 2.b | 2.f | 2.j | 2.n | res.r2 = detail::swizzleU1302(s4c); // | 3.c | 3.g | 3.k | 3.o | 3.d | 3.h | 3.l | 3.p | 3.a | 3.e | 3.i | 3.m | 3.b | 3.f | 3.j | 3.n | res.r3 = detail::swizzleU1302(s4d); return res; } // like ZipU4 but for two parallel zips // // in: // // A = | a0.rgba | a1.rgba | a2.rgba | a3.rgba | // B = | b0.rgba | b1.rgba | b2.rgba | b3.rgba | // C = | c0.rgba | c1.rgba | c2.rgba | c3.rgba | // D = | d0.rgba | d1.rgba | d2.rgba | d3.rgba | // // out: // // R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba | // R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba | // R2 = | c0.rgba | d0.rgba | c1.rgba | d1.rgba | // R3 = | c2.rgba | d2.rgba | c3.rgba | d3.rgba | // goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d) { const uint8x16x4_t res = { detail::unpacklo32(a, b), detail::unpackhi32(a, b), detail::unpacklo32(c, d), detail::unpackhi32(c, d), }; return res; } // // in: // // a = | a0.rgba | a1.rgba | a2.rgba| a3.rgba // b = | b0.rgba | b1.rgba | b2.rgba| b3.rgba // // out: // // R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba | // R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba | // goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b) { uint8x16x2_t res; res.r0 = detail::unpacklo32(a, b); res.r1 = detail::unpackhi32(a, b); return res; } // // in: // // a = | a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | aA | aB | aC | aD | aE | aF | // b = | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 | b8 | b9 | bA | bB | bC | bD | bE | bF | // // out: // // R0 = | a0 | b0 | a1 | b1 | a2 | b2 | a3 | b3 | a4 | b4 | a5 | b5 | a6 | b6 | a7 | b7 | // R1 = | a8 | b8 | a9 | b9 | aA | bA | aB | bB | aC | bC | aD | bD | aE | bE | aF | bF | // goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b) { uint8x16x2_t 
res; res.r0 = detail::unpacklo8(a, b); res.r1 = detail::unpackhi8(a, b); return res; } goofy_inline uint8x16_t not(const uint8x16_t& v) { uint8x16_t res; for (int i = 0; i < 16; i++) { res.data[i] = v.data[i] ^ 0xFF; } return res; } #endif } static_assert(sizeof(uint8x16_t) == 16, "Incorrect byte8x16 sizeof"); static_assert(sizeof(uint8x16x2_t) == 32, "Incorrect byte8x16x1 sizeof"); static_assert(sizeof(uint8x16x3_t) == 48, "Incorrect byte8x16x2 sizeof"); static_assert(sizeof(uint8x16x4_t) == 64, "Incorrect byte8x16x4 sizeof"); static_assert(sizeof(uint64x2_t) == 16, "Incorrect uint64x2_t sizeof"); // Block brightness variance to ETC control byte static const uint32_t etc1BrighnessRangeTocontrolByte[256] = { 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 
0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xFF000000, 0xFF000000 }; enum GoofyCodecType { GOOFY_DXT1, GOOFY_ETC1, }; // // Encode 4 DXT1/ETC1 at once // template goofy_inline void goofySimdEncode(const unsigned char* goofy_restrict inputRGBA, size_t inputStride, 
unsigned char* goofy_restrict pResult) { // Fetch 16x4 pixels from the buffer(four DX blocks) // 16 pixels wide is better for the CPU cache utilization (64 bytes per line) and it is better for SIMD lane utilization // ----------------------------------------------------------- uint8x16x4_t bl0; uint8x16x4_t bl1; uint8x16x4_t bl2; uint8x16x4_t bl3; bl0.r0 = simd::fetch(inputRGBA); bl1.r0 = simd::fetch(inputRGBA + 16); bl2.r0 = simd::fetch(inputRGBA + 32); bl3.r0 = simd::fetch(inputRGBA + 48); inputRGBA += inputStride; bl0.r1 = simd::fetch(inputRGBA); bl1.r1 = simd::fetch(inputRGBA + 16); bl2.r1 = simd::fetch(inputRGBA + 32); bl3.r1 = simd::fetch(inputRGBA + 48); inputRGBA += inputStride; bl0.r2 = simd::fetch(inputRGBA); bl1.r2 = simd::fetch(inputRGBA + 16); bl2.r2 = simd::fetch(inputRGBA + 32); bl3.r2 = simd::fetch(inputRGBA + 48); inputRGBA += inputStride; bl0.r3 = simd::fetch(inputRGBA); bl1.r3 = simd::fetch(inputRGBA + 16); bl2.r3 = simd::fetch(inputRGBA + 32); bl3.r3 = simd::fetch(inputRGBA + 48); // Find min block colors // ----------------------------------------------------------- const uint8x16x4_t blMin = { simd::minu(simd::minu(bl0.r0, bl0.r1), simd::minu(bl0.r2, bl0.r3)), // min0_clmn0.rgba | min0_clmn1.rgba | min0_clmn2.rgba | min0_clmn3.rgba simd::minu(simd::minu(bl1.r0, bl1.r1), simd::minu(bl1.r2, bl1.r3)), // min1_clmn0.rgba | min1_clmn1.rgba | min1_clmn2.rgba | min1_clmn3.rgba simd::minu(simd::minu(bl2.r0, bl2.r1), simd::minu(bl2.r2, bl2.r3)), // min2_clmn0.rgba | min2_clmn1.rgba | min2_clmn2.rgba | min2_clmn3.rgba simd::minu(simd::minu(bl3.r0, bl3.r1), simd::minu(bl3.r2, bl3.r3)) // min3_clmn0.rgba | min3_clmn1.rgba | min3_clmn2.rgba | min3_clmn3.rgba }; // blMinTr (transposed blMin) // min0_clmn0.rgba | min1_clmn0.rgba | min2_clmn0.rgba | min3_clmn0.rgba // min0_clmn1.rgba | min1_clmn1.rgba | min2_clmn1.rgba | min3_clmn1.rgba // min0_clmn2.rgba | min1_clmn2.rgba | min2_clmn2.rgba | min3_clmn2.rgba // min0_clmn3.rgba | min1_clmn3.rgba | 
min2_clmn3.rgba | min3_clmn3.rgba const uint8x16x4_t blMinTr = simd::transposeAs4x4(blMin); // Per-block min colors // min0.rgba | min1.rgba | min2.rgba | min3.rgba const uint8x16_t minColors = simd::minu( simd::minu(blMinTr.r0, blMinTr.r1), simd::minu(blMinTr.r2, blMinTr.r3) ); // Same to find max block colors // ----------------------------------------------------------- const uint8x16x4_t blMax = { simd::maxu(simd::maxu(bl0.r0, bl0.r1), simd::maxu(bl0.r2, bl0.r3)), simd::maxu(simd::maxu(bl1.r0, bl1.r1), simd::maxu(bl1.r2, bl1.r3)), simd::maxu(simd::maxu(bl2.r0, bl2.r1), simd::maxu(bl2.r2, bl2.r3)), simd::maxu(simd::maxu(bl3.r0, bl3.r1), simd::maxu(bl3.r2, bl3.r3)) }; const uint8x16x4_t blMaxTr = simd::transposeAs4x4(blMax); // Per-block max colors // max0.rgba | max1.rgba | max2.rgba | max3.rgba const uint8x16_t maxColors = simd::maxu( simd::maxu(blMaxTr.r0, blMaxTr.r1), simd::maxu(blMaxTr.r2, blMaxTr.r3) ); // Find min/max brigtness // ----------------------------------------------------------- // Note: some SSE lanes wasted, it is not ideal, but seems OK-ish? 
// // min0.rgba | min0.rgba | min1.rgba | min1.rgba // min2.rgba | min2.rgba | min3.rgba | min3.rgba // max0.rgba | max0.rgba | max1.rgba | max1.rgba // max2.rgba | max2.rgba | max3.rgba | max3.rgba const uint8x16x4_t blMinMax = simd::zipU4x2(minColors, minColors, maxColors, maxColors); // Deinterleave // min0.rr | min1.rr | min2.rr | min3.rr | max0.rr | max1.rr | max2.rr | max3.rr // min0.gg | min1.gg | min2.gg | min3.gg | max0.gg | max1.gg | max2.gg | max3.gg // min0.bb | min1.bb | min2.bb | min3.bb | max0.bb | max1.bb | max2.bb | max3.bb const uint8x16x3_t blMinMaxDi = simd::deinterleaveRGB(blMinMax); // Get Y component of YCoCg color-model (perceptual brightness) // https://en.wikipedia.org/wiki/YCoCg // Y = 0.25 * R + 0.25 * B + 0.5 * G // We can rewrite equation above using the following form // Y = (((R + B) / 2) + G) / 2 // // Y = min0.yy | min1.yy | min2.yy | min3.yy | max0.yy | max1.yy | max2.yy | max3.yy const uint8x16_t Y = simd::avg(simd::avg(blMinMaxDi.r0, blMinMaxDi.r2), blMinMaxDi.r1); // Min/max brightness per block // R0 = min0.yyyy | min1.yyyy | min2.yyyy | min3.yyyy // R1 = max0.yyyy | max1.yyyy | max2.yyyy | max3.yyyy const uint8x16x2_t blMinMaxY = simd::zipB16(Y, Y); // Clamp to min brightness const uint8x16_t constEight = simd::fetch(&gConstEight); // range0.yyyy | range1.yyyy | range2.yyyy | range3.yyyy const uint8x16_t blRangeY = simd::maxu(simd::subsatu(blMinMaxY.r1, blMinMaxY.r0), constEight); // mid0.yyyy | mid1.yyyy | mid2.yyyy | mid3.yyyy const uint8x16_t blMidY = simd::avg(blMinMaxY.r0, blMinMaxY.r1); // Approximate multiplication by 0.375 to get quantization thresholds const uint8x16_t constZero = simd::zero(); const uint8x16_t blHalfRangeY = simd::avg(blRangeY, constZero); const uint8x16_t blQuarterRangeY = simd::avg(blHalfRangeY, constZero); const uint8x16_t blEighthsRangeY = simd::avg(blQuarterRangeY, constZero); // Threshold = (quarter + eights) = (0.25 + 0.125) ~= (range * 0.375) // qt0.yyyy | qt1.yyyy | qt2.yyyy | qt3.yyyy 
const uint8x16_t blQThreshold = simd::addsatu(blQuarterRangeY, blEighthsRangeY); // Quantization (generate indices) // ----------------------------------------------------------- const uint8x16_t constMaxInt = simd::fetch(&gConstMaxInt); // block 0 // // p0.r p1.r p2.r p3.r p4.r p5.r p6.r p7.r p8.r p9.r p10.r p11.r p12.r p13.r p14.r p15.r // p0.g p1.g p2.g p3.g p4.g p5.g p6.g p7.g p8.g p9.g p10.g p11.g p12.g p13.g p14.g p15.g // p0.b p1.b p2.b p3.b p4.b p5.b p6.b p7.b p8.b p9.b p10.b p11.b p12.b p13.b p14.b p15.b const uint8x16x3_t bl0Di = simd::deinterleaveRGB(bl0); // Convert RGB to brightness // per-pixel block brightness const uint8x16_t bl0Y = simd::avg(simd::avg(bl0Di.r0, bl0Di.r2), bl0Di.r1); // Block brightness to compare with const uint8x16_t bl0MidY = simd::replicateU0000(blMidY); // Brightness difference (per-pixel in block) // NOTE: we need to clamp difference to max signed int8, because of the signed comparison later const uint8x16_t bl0PosDiffY = simd::minu(simd::subsatu(bl0Y, bl0MidY), constMaxInt); const uint8x16_t bl0NegDiffY = simd::minu(simd::subsatu(bl0MidY, bl0Y), constMaxInt); // Greater or Equal to zero mask const uint8x16_t bl0GezMask = simd::cmpeqi(bl0NegDiffY, constZero); // Absolute diffference of brightness (per-pixel in block) const uint8x16_t bl0AbsDiffY = simd::or(bl0PosDiffY, bl0NegDiffY); // get quantization threshold for current block const uint8x16_t bl0QThreshold = simd::replicateU0000(blQThreshold); // Less than Quantization Threshold mask const uint8x16_t bl0LqtMask = simd::cmplti(bl0AbsDiffY, bl0QThreshold); // Here we've got two bitmasks // // GezMask = greater or equal than zero (per pixel) // LqtMask = less than quantization threshold (per pixel) // // // min qt qt max // x---------x-----+-----x---------x // 0 // // |---------------| greater or equal than zero (GezMask) // // |-----------| less than quantization threshold (LqtMask) // // block 1 const uint8x16x3_t bl1Di = simd::deinterleaveRGB(bl1); const uint8x16_t bl1Y = 
simd::avg(simd::avg(bl1Di.r0, bl1Di.r2), bl1Di.r1); const uint8x16_t bl1MidY = simd::replicateU1111(blMidY); const uint8x16_t bl1PosDiffY = simd::minu(simd::subsatu(bl1Y, bl1MidY), constMaxInt); const uint8x16_t bl1NegDiffY = simd::minu(simd::subsatu(bl1MidY, bl1Y), constMaxInt); const uint8x16_t bl1GezMask = simd::cmpeqi(bl1NegDiffY, constZero); const uint8x16_t bl1AbsDiffY = simd::or(bl1PosDiffY, bl1NegDiffY); const uint8x16_t bl1QThreshold = simd::replicateU1111(blQThreshold); const uint8x16_t bl1LqtMask = simd::cmplti(bl1AbsDiffY, bl1QThreshold); // block 2 const uint8x16x3_t bl2Di = simd::deinterleaveRGB(bl2); const uint8x16_t bl2Y = simd::avg(simd::avg(bl2Di.r0, bl2Di.r2), bl2Di.r1); const uint8x16_t bl2MidY = simd::replicateU2222(blMidY); const uint8x16_t bl2PosDiffY = simd::minu(simd::subsatu(bl2Y, bl2MidY), constMaxInt); const uint8x16_t bl2NegDiffY = simd::minu(simd::subsatu(bl2MidY, bl2Y), constMaxInt); const uint8x16_t bl2GezMask = simd::cmpeqi(bl2NegDiffY, constZero); const uint8x16_t bl2AbsDiffY = simd::or(bl2PosDiffY, bl2NegDiffY); const uint8x16_t bl2QThreshold = simd::replicateU2222(blQThreshold); const uint8x16_t bl2LqtMask = simd::cmplti(bl2AbsDiffY, bl2QThreshold); // block 3 const uint8x16x3_t bl3Di = simd::deinterleaveRGB(bl3); const uint8x16_t bl3Y = simd::avg(simd::avg(bl3Di.r0, bl3Di.r2), bl3Di.r1); const uint8x16_t bl3MidY = simd::replicateU3333(blMidY); const uint8x16_t bl3PosDiffY = simd::minu(simd::subsatu(bl3Y, bl3MidY), constMaxInt); const uint8x16_t bl3NegDiffY = simd::minu(simd::subsatu(bl3MidY, bl3Y), constMaxInt); const uint8x16_t bl3GezMask = simd::cmpeqi(bl3NegDiffY, constZero); const uint8x16_t bl3AbsDiffY = simd::or(bl3PosDiffY, bl3NegDiffY); const uint8x16_t bl3QThreshold = simd::replicateU3333(blQThreshold); const uint8x16_t bl3LqtMask = simd::cmplti(bl3AbsDiffY, bl3QThreshold); // Finalize blocks // ----------------------------------------------------------- if (CODEC_TYPE == GOOFY_DXT1) { // Generate DXT indices using 
given masks // DXT indices order // ------------------------- // C0(max) C2 C3 C1(min) // DEC: | 0 | 2 | 3 | 1 | // BIN: | 00b | 10b | 11b | 01b | // // | GezMask | // | LqtMask | // Zip two masks to match DX bits order // Gez0 | Lqt0 | Gez1 | Lqt1 | Gez2 | Lqt2 | Gez3 | Lqt3 | Gez4 | Lqt4 | Gez5 | Lqt5 | Gez6 | Lqt6 | Gez7 | Lqt7 // Gez8 | Lqt8 | Gez9 | Lqt9 | GezA | LqtA | GezB | LqtB | GezC | LqtC | GezD | LqtD | GezE | LqtE | GezF | LqtF const uint8x16x2_t bl0RawIndices = simd::zipB16(simd::not(bl0GezMask), bl0LqtMask); const uint8x16x2_t bl3RawIndices = simd::zipB16(simd::not(bl3GezMask), bl3LqtMask); const uint8x16x2_t bl2RawIndices = simd::zipB16(simd::not(bl2GezMask), bl2LqtMask); const uint8x16x2_t bl1RawIndices = simd::zipB16(simd::not(bl1GezMask), bl1LqtMask); // Bytes to bits uint32_t bl0Indices = simd::moveMaskMSB(bl0RawIndices.r0) | (simd::moveMaskMSB(bl0RawIndices.r1) << 16); uint32_t bl1Indices = simd::moveMaskMSB(bl1RawIndices.r0) | (simd::moveMaskMSB(bl1RawIndices.r1) << 16); uint32_t bl2Indices = simd::moveMaskMSB(bl2RawIndices.r0) | (simd::moveMaskMSB(bl2RawIndices.r1) << 16); uint32_t bl3Indices = simd::moveMaskMSB(bl3RawIndices.r0) | (simd::moveMaskMSB(bl3RawIndices.r1) << 16); // Convert rgb888 to rgb555 // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead // We need to sub eight before, because avg is (a+b+1) >> 1 // max555_0.rgba | max555_1.rgba | max555_2.rgba | max555_3.rgba const uint8x16_t maxColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(maxColors, constEight), constZero), constZero), constZero); // min555_0.rgba | min555_1.rgba | min555_2.rgba | min555_3.rgba const uint8x16_t minColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(minColors, constEight), constZero), constZero), constZero); // max555_0.rgba | min555_0.rgba | max555_1.rgba | min555_1.rgba // max555_2.rgba | min555_2.rgba | max555_3.rgba | min555_3.rgba const uint8x16x2_t maxMinColors555 = simd::zipU4(maxColors555, 
minColors555); const uint64x2_t maxMin01 = simd::getAsUInt64x2(maxMinColors555.r0); const uint64x2_t maxMin23 = simd::getAsUInt64x2(maxMinColors555.r1); // R0 // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 11 = 0000000000000000 1111100000000000b // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b >> 2 = 0000000000000000 0000011111000000b // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b >> 16 = 0000000000000000 0000000000011111b // R1 // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 5 = 1111100000000000 0000000000000000b // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 18 = 0000011111000000 0000000000000000b // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 32 = 0000000000011111 0000000000000000b // 0x20 = 0000000000000000 0000000000100000b uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult; uint32_t block0a = (uint32_t)(0x20 | // max color green channel LSB (to avoid switching to DXT1 3-color mode) (maxMin01.r0 & 0x1Full) << 11ull | (maxMin01.r0 & 0x1F00ull) >> 2ull | (maxMin01.r0 & 0x1F0000ull) >> 16ull | // max color (maxMin01.r0 & 0x1F00000000ull) >> 5ull | (maxMin01.r0 & 0x1F0000000000ull) >> 18ull | (maxMin01.r0 & 0x1F000000000000ull) >> 32ull); // min color //uint32_t block0b = bl0Indices << 32ull; // indices *pDest = block0a; pDest++; *pDest = bl0Indices; uint32_t block1a = (uint32_t)(0x20 | (maxMin01.r1 & 0x1Full) << 11ull | (maxMin01.r1 & 0x1F00ull) >> 2ull | (maxMin01.r1 & 0x1F0000ull) >> 16ull | (maxMin01.r1 & 0x1F00000000ull) >> 5ull | (maxMin01.r1 & 0x1F0000000000ull) >> 18ull | (maxMin01.r1 & 0x1F000000000000ull) >> 32ull); //uint32_t block1b = bl1Indices << 32ull; pDest++; *pDest = block1a; pDest++; *pDest = bl1Indices; uint32_t block2a = (uint32_t)(0x20 | (maxMin23.r0 & 0x1Full) << 11ull | (maxMin23.r0 & 0x1F00ull) >> 2ull | (maxMin23.r0 & 0x1F0000ull) >> 16ull | (maxMin23.r0 & 0x1F00000000ull) >> 5ull | 
(maxMin23.r0 & 0x1F0000000000ull) >> 18ull | (maxMin23.r0 & 0x1F000000000000ull) >> 32ull); //bl2Indices << 32ull; pDest++; *pDest = block2a; pDest++; *pDest = bl2Indices; uint32_t block3a = (uint32_t)(0x20 | (maxMin23.r1 & 0x1Full) << 11ull | (maxMin23.r1 & 0x1F00ull) >> 2ull | (maxMin23.r1 & 0x1F0000ull) >> 16ull | (maxMin23.r1 & 0x1F00000000ull) >> 5ull | (maxMin23.r1 & 0x1F0000000000ull) >> 18ull | (maxMin23.r1 & 0x1F000000000000ull) >> 32ull); //bl3Indices << 32ull; pDest++; *pDest = block3a; pDest++; *pDest = bl3Indices; } else if (CODEC_TYPE == GOOFY_ETC1) { // Combined masks (major bit = GreaterEqualZero other 7 bits = LessQuantizationThreshold) const uint8x16x4_t blMasks = { simd::or(simd::andnot(constMaxInt, bl0GezMask), simd::and(bl0LqtMask, constMaxInt)), simd::or(simd::andnot(constMaxInt, bl1GezMask), simd::and(bl1LqtMask, constMaxInt)), simd::or(simd::andnot(constMaxInt, bl2GezMask), simd::and(bl2LqtMask, constMaxInt)), simd::or(simd::andnot(constMaxInt, bl3GezMask), simd::and(bl3LqtMask, constMaxInt)) }; // +---+---+---+---+ +---+---+---+---+ // | A | B | C | D | | C | G | K | O | // +---+---+---+---+ +---+---+---+---+ // | E | F | G | H | | D | H | L | P | // +---+---+---+---+ --> +---+---+---+---+ // | I | J | K | L | | A | E | I | M | // +---+---+---+---+ +---+---+---+---+ // | M | N | O | P | | B | F | J | N | // +---+---+---+---+ +---+---+---+---+ const uint8x16x4_t blMasksTr = simd::transposeAs4x4x4(blMasks); // Unpack masks and copy from bytes to bits const uint32_t bl0PosOrZero = simd::moveMaskMSB(blMasksTr.r0); const uint32_t bl1PosOrZero = simd::moveMaskMSB(blMasksTr.r1); const uint32_t bl2PosOrZero = simd::moveMaskMSB(blMasksTr.r2); const uint32_t bl3PosOrZero = simd::moveMaskMSB(blMasksTr.r3); uint8x16_t bl0LessThanQtMask = simd::and(blMasksTr.r0, constMaxInt); uint8x16_t bl1LessThanQtMask = simd::and(blMasksTr.r1, constMaxInt); uint8x16_t bl2LessThanQtMask = simd::and(blMasksTr.r2, constMaxInt); uint8x16_t bl3LessThanQtMask = 
simd::and(blMasksTr.r3, constMaxInt); bl0LessThanQtMask = simd::addsatu(bl0LessThanQtMask, bl0LessThanQtMask); bl1LessThanQtMask = simd::addsatu(bl1LessThanQtMask, bl1LessThanQtMask); bl2LessThanQtMask = simd::addsatu(bl2LessThanQtMask, bl2LessThanQtMask); bl3LessThanQtMask = simd::addsatu(bl3LessThanQtMask, bl3LessThanQtMask); const uint32_t bl0LessThanQt = simd::moveMaskMSB(bl0LessThanQtMask); const uint32_t bl1LessThanQt = simd::moveMaskMSB(bl1LessThanQtMask); const uint32_t bl2LessThanQt = simd::moveMaskMSB(bl2LessThanQtMask); const uint32_t bl3LessThanQt = simd::moveMaskMSB(bl3LessThanQtMask); #if 1 // Keep chromatic component from the average color, but override brightness // NOTE: This is slightly slower but gets slightly better quality // Find average blocks color const uint8x16x4_t blAvg = { simd::avg(simd::avg(bl0.r0, bl0.r1), simd::avg(bl0.r2, bl0.r3)), simd::avg(simd::avg(bl1.r0, bl1.r1), simd::avg(bl1.r2, bl1.r3)), simd::avg(simd::avg(bl2.r0, bl2.r1), simd::avg(bl2.r2, bl2.r3)), simd::avg(simd::avg(bl3.r0, bl3.r1), simd::avg(bl3.r2, bl3.r3)) }; const uint8x16x4_t blAvgTr = simd::transposeAs4x4(blAvg); const uint8x16_t blAvgColors = simd::avg( simd::avg(blAvgTr.r0, blAvgTr.r1), simd::avg(blAvgTr.r2, blAvgTr.r3) ); // Note: a lot of SSE lanes wasted, it is not ideal, TODO? 
// // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba const uint8x16x4_t blAvg4 = simd::zipU4x2(blAvgColors, blAvgColors, blAvgColors, blAvgColors); // Deinterleave // avg0.rr | avg1.rr | avg2.rr | avg3.rr | avg0.rr | avg1.rr | avg2.rr | avg3.rr // avg0.gg | avg1.gg | avg2.gg | avg3.gg | avg0.gg | avg1.gg | avg2.gg | avg3.gg // avg0.bb | avg1.bb | avg2.bb | avg3.bb | avg0.bb | avg1.bb | avg2.bb | avg3.bb const uint8x16x3_t blAvg4Di = simd::deinterleaveRGB(blAvg4); // Y = avg0.yy | avg1.yy | avg2.yy | avg3.yy | avg0.yy | avg1.yy | avg2.yy | avg3.yy const uint8x16_t Y = simd::avg(simd::avg(blAvg4Di.r0, blAvg4Di.r2), blAvg4Di.r1); // Min/max brightness per block // R0 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy // R1 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy // NOTE: not used! const uint8x16x2_t blAvgY = simd::zipB16(Y, Y); const uint8x16_t blPosCorrectionY = simd::minu(simd::subsatu(blMidY, blAvgY.r0), constMaxInt); const uint8x16_t blNegCorrectionY = simd::minu(simd::subsatu(blAvgY.r0, blMidY), constMaxInt); const uint8x16_t blCorrectionYGezMask = simd::cmpeqi(blNegCorrectionY, constZero); const uint8x16_t blCorrectionYAbs = simd::or(blPosCorrectionY, blNegCorrectionY); // Get the color in the middle between min/max colors of the block. // NOTE: this is not the same as an average block color. const uint8x16_t blBaseColorsPos = simd::addsatu(blAvgColors, blCorrectionYAbs); const uint8x16_t blBaseColorsNeg = simd::subsatu(blAvgColors, blCorrectionYAbs); const uint8x16_t blBaseColors = simd::select(blCorrectionYGezMask, blBaseColorsPos, blBaseColorsNeg); #else // Get the color in the middle between min/max colors of the block. // NOTE: this is not the same as an average block color. 
const uint8x16_t blBaseColors = simd::avg(minColors, maxColors); #endif // Convert rgb888 to rgb555 // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead // We need to sub eight before, because avg is (a+b+1) >> 1 // mid555_0.rgba | mid555_1.rgba | mid555_2.rgba | mid555_3.rgba const uint8x16_t baseColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(blBaseColors, constEight), constZero), constZero), constZero); const uint64x2_t baseColors = simd::getAsUInt64x2(baseColors555); // R0 // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 3 = 00000000 00000000 11111000b // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b << 3 = 00000000 11111000 00000000b // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b << 3 = 11111000 00000000 00000000b // R1 // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 29 = 00000000 00000000 11111000b // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 29 = 00000000 11111000 00000000b // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 29 = 11111000 00000000 00000000b uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult; const uint32_t block0a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[0]] | ((baseColors.r0 << 3ull) & 0xFFFFFF); const uint32_t block0b = ~(bl0PosOrZero | (bl0LessThanQt << 16)); *pDest = block0a; pDest++; *pDest = block0b; const uint32_t block1a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[4]] | ((baseColors.r0 >> 29ull) & 0xFFFFFF); const uint32_t block1b = ~(bl1PosOrZero | (bl1LessThanQt << 16)); pDest++; *pDest = block1a; pDest++; *pDest = block1b; const uint32_t block2a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[8]] | ((baseColors.r1 << 3ull) & 0xFFFFFF); const uint32_t block2b = ~(bl2PosOrZero | (bl2LessThanQt << 16)); pDest++; *pDest = block2a; pDest++; *pDest = block2b; const uint32_t block3a = 
etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[12]] | ((baseColors.r1 >> 29ull) & 0xFFFFFF); const uint32_t block3b = ~(bl3PosOrZero | (bl3LessThanQt << 16)); pDest++; *pDest = block3a; pDest++; *pDest = block3b; } } int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride) { // those checks are required because of 4x1 block window inside the compressor if (width % 16 != 0) { return -1; } if (height % 4 != 0) { return -2; } unsigned int blockW = width >> 2; unsigned int blockH = height >> 2; size_t inputStride = stride; for (uint32_t y = 0; y < blockH; y++) { const unsigned char* goofy_restrict encoderPos = input; for (uint32_t x = 0; x < blockW; x += 4) { goofySimdEncode(encoderPos, inputStride, result); encoderPos += 64; // 16 rgba pixels (4 DXT blocks) = 16 * 4 = 64 result += 32; // 4 DXT1 blocks = 8 * 4 = 32 } input += inputStride * 4; // 4 lines } return 0; } int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride) { // those checks are required because of 4x1 block window inside the compressor if (width % 16 != 0) { return -1; } if (height % 4 != 0) { return -2; } unsigned int blockW = width >> 2; unsigned int blockH = height >> 2; size_t inputStride = stride; for (uint32_t y = 0; y < blockH; y++) { const unsigned char* goofy_restrict encoderPos = input; for (uint32_t x = 0; x < blockW; x += 4) { goofySimdEncode(encoderPos, inputStride, result); encoderPos += 64; // 16 rgba pixels (4 DXT blocks) = 16 * 4 = 64 result += 32; // 4 DXT1 blocks = 8 * 4 = 32 } input += inputStride * 4; // 4 lines } return 0; } #undef goofy_restrict #undef goofy_inline #undef goofy_align16 } #endif // Copyright (c) 2020 Sergey Makeev // // Permission is hereby granted, free of charge, to any person obtaining // a copy of this software and associated documentation files (the // "Software"), to deal in the Software without restriction, 
including // without limitation the rights to use, copy, modify, merge, publish, // distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to // the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. // IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE // SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.