diff --git a/extern/goofy_tc.h b/extern/goofy_tc.h
new file mode 100644
index 0000000..43197af
--- /dev/null
+++ b/extern/goofy_tc.h
@@ -0,0 +1,1565 @@
+// goofy_tc.h v1.0
+// Realtime BC1/ETC1 encoder by Sergey Makeev <sergeymakeev@hotmail.com>
+// 
+// LICENSE:
+//  MIT license at the end of this file.
+
+namespace goofy // public encoder entry points; the implementation section further down is compiled only when GOOFYTC_IMPLEMENTATION is defined
+{
+    int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); // DXT1 encoder (the file header calls the codec BC1). NOTE(review): input pixel layout, stride units and the meaning of the int result are not visible here - confirm against the definition
+    int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride); // ETC1 encoder, same parameter list as compressDXT1. NOTE(review): same open questions as above
+}
+
+#include <stdint.h>
+
+// Enable SSE2 codec.
+// Define GOOFY_SSE2 before including this header to force the SSE2 path. When it is not
+// defined by the user it is switched on only if the compiler reports an SSE2 capable
+// target; every other target falls back to the generic scalar codec instead of failing
+// on <emmintrin.h>.
+#ifndef GOOFY_SSE2
+#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
+#define GOOFY_SSE2 (1)
+#endif
+#endif
+
+// Compiler abstraction. MSVC keeps the original spellings, GCC/Clang get their
+// equivalents, anything else gets the plain C++11 forms.
+#if defined(_MSC_VER)
+#define goofy_restrict __restrict
+#define goofy_inline __forceinline
+#define goofy_align16(x) __declspec(align(16)) x
+#elif defined(__GNUC__)
+#define goofy_restrict __restrict__
+#define goofy_inline inline __attribute__((always_inline))
+#define goofy_align16(x) x __attribute__((aligned(16)))
+#else
+#define goofy_restrict
+#define goofy_inline inline
+#define goofy_align16(x) alignas(16) x
+#endif
+
+#ifdef GOOFY_SSE2
+#include <emmintrin.h>  // SSE2
+#else
+#include <string.h> // memset/memcpy (declared here, not in <string>)
+#endif
+
+#ifdef GOOFYTC_IMPLEMENTATION
+namespace goofy
+{
+
+// constants: 16-byte aligned tables; every one of the 16 bytes in a table holds the same value
+goofy_align16(static const uint32_t gConstEight[4]) = { 0x08080808, 0x08080808, 0x08080808, 0x08080808 }; // each byte = 0x08 (8)
+goofy_align16(static const uint32_t gConstSixteen[4]) = { 0x10101010, 0x10101010, 0x10101010, 0x10101010 }; // each byte = 0x10 (16)
+goofy_align16(static const uint32_t gConstMaxInt[4]) = { 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f }; // each byte = 0x7f (127, the largest signed 8-bit value)
+
+#ifdef GOOFY_SSE2
+// SSE2 build: the 16-byte vector is the native 128-bit integer type.
+typedef __m128i uint8x16_t;
+#else
+
+// Generic build: a plain 16-byte value with overlapping byte / 16-bit / 32-bit / 64-bit views.
+struct uint8x16_t
+{
+    union
+    {
+        // byte views (unsigned, signed, unsigned)
+        uint8_t data[16];
+        int8_t m128i_i8[16];
+        uint8_t m128i_u8[16];
+
+        // sixteen bytes named as four r/g/b/a groups
+        struct
+        {
+            uint8_t r0, g0, b0, a0;
+            uint8_t r1, g1, b1, a1;
+            uint8_t r2, g2, b2, a2;
+            uint8_t r3, g3, b3, a3;
+        };
+
+        // eight 16-bit lanes
+        struct
+        {
+            uint16_t s0, s1, s2, s3, s4, s5, s6, s7;
+        };
+
+        // four 32-bit lanes
+        struct
+        {
+            uint32_t u0, u1, u2, u3;
+        };
+
+        // two 64-bit lanes
+        struct
+        {
+            uint64_t l0, l1;
+        };
+    };
+};
+
+#endif
+
+
+// 2x16xU8: a pair of 16-byte vectors (32 bytes, checked by a static_assert further down)
+struct uint8x16x2_t
+{
+    // rows, in order
+    uint8x16_t r0;
+    uint8x16_t r1;
+};
+
+// 3x16xU8: three 16-byte vectors (48 bytes); deinterleaveRGB returns its three planes in this type
+struct uint8x16x3_t
+{
+    // rows, in order
+    uint8x16_t r0;
+    uint8x16_t r1;
+    uint8x16_t r2;
+};
+
+// 4x16xU8: four 16-byte vectors (64 bytes)
+struct uint8x16x4_t
+{
+    // rows; keep this member order, zipU4x2 fills the struct with a positional brace initializer
+    uint8x16_t r0;
+    uint8x16_t r1;
+    uint8x16_t r2;
+    uint8x16_t r3;
+};
+
+// 2xU64: the two 64-bit halves of a 16-byte vector, as produced by getAsUInt64x2
+struct uint64x2_t
+{
+    uint64_t r0; // low half (bytes 0..7 of the vector)
+    uint64_t r1; // high half (bytes 8..15 of the vector)
+};
+
+
+namespace simd
+{
+// SSE2 implementation    
+#ifdef GOOFY_SSE2
+
+    goofy_inline uint8x16_t zero() // vector with all 128 bits cleared
+    {
+        return _mm_setzero_si128();
+    }
+
+    // Loads 16 bytes starting at p.
+    // Uses the unaligned load so that p carries no alignment requirement, exactly like the
+    // memcpy based generic implementation of fetch(); 16-byte aligned pointers keep working.
+    goofy_inline uint8x16_t fetch(const void* p)
+    {
+        return _mm_loadu_si128((const __m128i*)p);
+    }
+
+    // Splits the vector into its low (r0) and high (r1) 64-bit halves.
+    // Done with one 16-byte store instead of _mm_cvtsi128_si64, which is only provided
+    // when targeting 64-bit x86, so the SSE2 codec also builds for 32-bit x86.
+    goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a)
+    {
+        uint64x2_t res;
+        // uint64x2_t is exactly 16 bytes: r0 receives bytes 0..7, r1 receives bytes 8..15
+        _mm_storeu_si128((__m128i*)&res, a);
+        return res;
+    }
+
+    goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b) // bitwise OR. NOTE(review): "or" is an alternative operator token in ISO C++, so this name only compiles where operator names are treated as identifiers
+    {
+        return _mm_or_si128(a, b);
+    }
+
+    goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b) // bitwise AND. NOTE(review): "and" is an alternative operator token in ISO C++, so this name only compiles where operator names are treated as identifiers
+    {
+        return _mm_and_si128(a, b);
+    }
+
+    goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b) // (~a) & b: the FIRST argument is the one that gets inverted
+    {
+        return _mm_andnot_si128(a, b);
+    }
+
+    // Bitwise blend: result takes the bits of a where mask is 1 and the bits of b where mask is 0.
+    goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b)
+    {
+        const __m128i keptFromA = _mm_and_si128(mask, a);
+        const __m128i keptFromB = _mm_andnot_si128(mask, b);
+        return _mm_or_si128(keptFromA, keptFromB);
+    }
+
+    goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned minimum
+    {
+        return _mm_min_epu8(a, b);
+    }
+
+    goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned maximum
+    {
+        return _mm_max_epu8(a, b);
+    }
+
+    goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned average, rounded up: (a + b + 1) >> 1
+    {
+        return _mm_avg_epu8(a, b);
+    }
+
+    goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a) // copies 32-bit lane 0 of a into all four lanes
+    {
+        return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0));
+    }
+
+    goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a) // copies 32-bit lane 1 of a into all four lanes
+    {
+        return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1));
+    }
+
+    goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a) // copies 32-bit lane 2 of a into all four lanes
+    {
+        return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2));
+    }
+
+    goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a) // copies 32-bit lane 3 of a into all four lanes
+    {
+        return _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3));
+    }
+
+    goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b) // per-byte equality: 0xFF where a == b, 0x00 elsewhere
+    {
+        return _mm_cmpeq_epi8(a, b);
+    }
+
+    goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b) // per-byte SIGNED a < b: 0xFF where true, 0x00 elsewhere
+    {
+        return _mm_cmplt_epi8(a, b);
+    }
+
+    goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned add, clamped at 255
+    {
+        return _mm_adds_epu8(a, b);
+    }
+
+    goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b) // per-byte unsigned a - b, clamped at 0
+    {
+        return _mm_subs_epu8(a, b);
+    }
+
+    // Transposes a 4x4 matrix of 32-bit elements: element j of input row i becomes element i of output row j.
+    goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v)
+    {
+        // interleave 32-bit elements of rows 0/1 and of rows 2/3
+        const __m128i lo01 = _mm_unpacklo_epi32(v.r0, v.r1);
+        const __m128i hi01 = _mm_unpackhi_epi32(v.r0, v.r1);
+        const __m128i lo23 = _mm_unpacklo_epi32(v.r2, v.r3);
+        const __m128i hi23 = _mm_unpackhi_epi32(v.r2, v.r3);
+
+        // join the matching 64-bit halves to form the transposed rows
+        uint8x16x4_t out;
+        out.r0 = _mm_unpacklo_epi64(lo01, lo23);
+        out.r1 = _mm_unpackhi_epi64(lo01, lo23);
+        out.r2 = _mm_unpacklo_epi64(hi01, hi23);
+        out.r3 = _mm_unpackhi_epi64(hi01, hi23);
+        return out;
+    }
+
+    goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v) // splits four rows of 4-byte pixels into byte planes; only the first three planes are returned
+    {
+        uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1); // pass 1: byte-interleave rows 0/1 ...
+        uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
+        uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3); // ... and rows 2/3
+        uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
+        uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);   // pass 2: interleave the pass 1 results
+        uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
+        uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
+        uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
+        uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);   // pass 3: each 64-bit half now holds one channel of two rows
+        uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
+        uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
+        uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
+
+        uint8x16x3_t res;
+        res.r0 = _mm_unpacklo_epi64(s2a, s2c);   // red   (byte 0 of every pixel)
+        res.r1 = _mm_unpackhi_epi64(s2a, s2c);   // green (byte 1 of every pixel)
+        res.r2 = _mm_unpacklo_epi64(s2b, s2d);   // blue  (byte 2 of every pixel)
+        //res.r3 = _mm_unpackhi_epi64(s2b, s2d); // alpha (byte 3) is intentionally not produced
+        return res;
+    }
+
+    // transpose as four single channel 4x4 blocks at once
+    //
+    // in:
+    //
+    // R0  = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop |
+    // R1  = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop |
+    // R2  = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop |
+    // R3  = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop |
+    //
+    // out (NOTE: the transposed columns come out swapped, in the order 2, 3, 0, 1):
+    //
+    // R0  = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn |
+    // R1  = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn |
+    // R2  = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn |
+    // R3  = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn |
+    //
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | A | B | C | D |           | C | G | K | O |
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | E | F | G | H |           | D | H | L | P |
+    //  +---+---+---+---+    -->    +---+---+---+---+
+    //  | I | J | K | L |           | A | E | I | M |
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | M | N | O | P |           | B | F | J | N |
+    //  +---+---+---+---+           +---+---+---+---+
+    //
+    goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v)
+    {
+        const uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1); // three byte-interleave passes gather the columns of each block
+        const uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
+        const uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3);
+        const uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
+        const uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);
+        const uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
+        const uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
+        const uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
+        const uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);
+        const uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
+        const uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
+        const uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
+
+        const uint8x16_t s3a = _mm_unpacklo_epi32(s2a, s2b); // 32-bit interleave: group the columns per block pair
+        const uint8x16_t s3b = _mm_unpackhi_epi32(s2a, s2b);
+        const uint8x16_t s3c = _mm_unpacklo_epi32(s2c, s2d);
+        const uint8x16_t s3d = _mm_unpackhi_epi32(s2c, s2d);
+
+        const uint8x16_t s4a = _mm_unpacklo_epi64(s3a, s3b); // 64-bit interleave: one block per vector, columns ordered 0, 2, 1, 3
+        const uint8x16_t s4b = _mm_unpackhi_epi64(s3a, s3b);
+        const uint8x16_t s4c = _mm_unpacklo_epi64(s3c, s3d);
+        const uint8x16_t s4d = _mm_unpackhi_epi64(s3c, s3d);
+
+        uint8x16x4_t res;
+        res.r0 = _mm_shuffle_epi32(s4a, _MM_SHUFFLE(2, 0, 3, 1)); // picks lanes 1, 3, 0, 2 -> columns 2, 3, 0, 1 of block 0
+        res.r1 = _mm_shuffle_epi32(s4b, _MM_SHUFFLE(2, 0, 3, 1)); // block 1
+        res.r2 = _mm_shuffle_epi32(s4c, _MM_SHUFFLE(2, 0, 3, 1)); // block 2
+        res.r3 = _mm_shuffle_epi32(s4d, _MM_SHUFFLE(2, 0, 3, 1)); // block 3
+        return res;
+    }
+
+    // Two independent 32-bit zips: rows 0/1 interleave the lanes of a and b, rows 2/3 those of c and d.
+    goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d)
+    {
+        uint8x16x4_t zipped;
+        zipped.r0 = _mm_unpacklo_epi32(a, b); // a0 b0 a1 b1
+        zipped.r1 = _mm_unpackhi_epi32(a, b); // a2 b2 a3 b3
+        zipped.r2 = _mm_unpacklo_epi32(c, d); // c0 d0 c1 d1
+        zipped.r3 = _mm_unpackhi_epi32(c, d); // c2 d2 c3 d3
+        return zipped;
+    }
+
+    // Interleaves the 32-bit lanes of a and b: r0 = a0 b0 a1 b1, r1 = a2 b2 a3 b3.
+    goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        const __m128i lowPairs = _mm_unpacklo_epi32(a, b);
+        const __m128i highPairs = _mm_unpackhi_epi32(a, b);
+
+        uint8x16x2_t zipped;
+        zipped.r0 = lowPairs;
+        zipped.r1 = highPairs;
+        return zipped;
+    }
+
+    goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v) // bit i of the result = top bit of byte i; only the low 16 bits can be set
+    {
+        return (uint32_t)_mm_movemask_epi8(v);
+    }
+
+    // Interleaves the bytes of a and b: r0 = a0 b0 ... a7 b7, r1 = a8 b8 ... a15 b15.
+    goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        const __m128i lowBytes = _mm_unpacklo_epi8(a, b);
+        const __m128i highBytes = _mm_unpackhi_epi8(a, b);
+
+        uint8x16x2_t zipped;
+        zipped.r0 = lowBytes;
+        zipped.r1 = highBytes;
+        return zipped;
+    }
+
+    // Bitwise complement of all 128 bits.
+    goofy_inline uint8x16_t not(const uint8x16_t& v)
+    {
+        // comparing a register with itself yields all ones; XOR with that flips every bit
+        const __m128i zeros = _mm_setzero_si128();
+        const __m128i allOnes = _mm_cmpeq_epi32(zeros, zeros);
+        return _mm_xor_si128(v, allOnes);
+    }
+
+#else
+    // generic CPU implementation    
+    namespace detail
+    {
+        // Interleaves the four low 16-bit lanes: a.s0 b.s0 a.s1 b.s1 a.s2 b.s2 a.s3 b.s3
+        goofy_inline uint8x16_t unpacklo16(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            for (uint32_t lane = 0; lane < 4; lane++)
+            {
+                // each 16-bit lane is moved as its two bytes, so lane contents are preserved as-is
+                out.data[4 * lane + 0] = a.data[2 * lane + 0];
+                out.data[4 * lane + 1] = a.data[2 * lane + 1];
+                out.data[4 * lane + 2] = b.data[2 * lane + 0];
+                out.data[4 * lane + 3] = b.data[2 * lane + 1];
+            }
+            return out;
+        }
+
+        // Interleaves the four high 16-bit lanes: a.s4 b.s4 a.s5 b.s5 a.s6 b.s6 a.s7 b.s7
+        goofy_inline uint8x16_t unpackhi16(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            for (uint32_t lane = 0; lane < 4; lane++)
+            {
+                out.data[4 * lane + 0] = a.data[8 + 2 * lane + 0];
+                out.data[4 * lane + 1] = a.data[8 + 2 * lane + 1];
+                out.data[4 * lane + 2] = b.data[8 + 2 * lane + 0];
+                out.data[4 * lane + 3] = b.data[8 + 2 * lane + 1];
+            }
+            return out;
+        }
+
+        // Interleaves the low eight bytes: a0 b0 a1 b1 ... a7 b7
+        goofy_inline uint8x16_t unpacklo8(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            for (uint32_t i = 0; i < 8; i++)
+            {
+                out.data[2 * i + 0] = a.data[i];
+                out.data[2 * i + 1] = b.data[i];
+            }
+            return out;
+        }
+
+        // Interleaves the high eight bytes: a8 b8 a9 b9 ... a15 b15
+        goofy_inline uint8x16_t unpackhi8(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            for (uint32_t i = 0; i < 8; i++)
+            {
+                out.data[2 * i + 0] = a.data[8 + i];
+                out.data[2 * i + 1] = b.data[8 + i];
+            }
+            return out;
+        }
+
+        // Low 64-bit half of a followed by low 64-bit half of b
+        goofy_inline uint8x16_t unpacklo64(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            out.l0 = a.l0;
+            out.l1 = b.l0;
+            return out;
+        }
+
+        // High 64-bit half of a followed by high 64-bit half of b
+        goofy_inline uint8x16_t unpackhi64(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            out.l0 = a.l1;
+            out.l1 = b.l1;
+            return out;
+        }
+
+        // Interleaves the two low 32-bit lanes: a.u0 b.u0 a.u1 b.u1
+        goofy_inline uint8x16_t unpacklo32(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            out.u0 = a.u0;
+            out.u1 = b.u0;
+            out.u2 = a.u1;
+            out.u3 = b.u1;
+            return out;
+        }
+
+        // Interleaves the two high 32-bit lanes: a.u2 b.u2 a.u3 b.u3
+        goofy_inline uint8x16_t unpackhi32(const uint8x16_t& a, const uint8x16_t& b)
+        {
+            uint8x16_t out;
+            out.u0 = a.u2;
+            out.u1 = b.u2;
+            out.u2 = a.u3;
+            out.u3 = b.u3;
+            return out;
+        }
+
+        // Duplicates the two low 32-bit lanes: u0 u0 u1 u1
+        goofy_inline uint8x16_t replicateU0011(const uint8x16_t& a)
+        {
+            const uint32_t first = a.u0;
+            const uint32_t second = a.u1;
+
+            uint8x16_t out;
+            out.u0 = first;
+            out.u1 = first;
+            out.u2 = second;
+            out.u3 = second;
+            return out;
+        }
+
+        // Duplicates the two high 32-bit lanes: u2 u2 u3 u3
+        goofy_inline uint8x16_t replicateU2233(const uint8x16_t& a)
+        {
+            const uint32_t first = a.u2;
+            const uint32_t second = a.u3;
+
+            uint8x16_t out;
+            out.u0 = first;
+            out.u1 = first;
+            out.u2 = second;
+            out.u3 = second;
+            return out;
+        }
+
+        // Reorders the 32-bit lanes to u1 u3 u0 u2
+        goofy_inline uint8x16_t swizzleU1302(const uint8x16_t& a)
+        {
+            uint8x16_t out;
+            out.u0 = a.u1;
+            out.u1 = a.u3;
+            out.u2 = a.u0;
+            out.u3 = a.u2;
+            return out;
+        }
+
+    } //detail
+
+    // Returns a vector whose 16 bytes are all zero.
+    goofy_inline uint8x16_t zero()
+    {
+        uint8x16_t cleared;
+        // the two 64-bit lanes together cover all 16 bytes
+        cleared.l0 = 0;
+        cleared.l1 = 0;
+        return cleared;
+    }
+
+    goofy_inline uint8x16_t fetch(const void* p) // reads 16 bytes starting at p
+    {
+        uint8x16_t r;
+        memcpy(&r, p, sizeof(uint8x16_t)); // plain byte copy, so p needs no particular alignment
+        return r;
+    }
+
+    // Returns the two 64-bit halves of a: l0 in r0, l1 in r1.
+    goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a)
+    {
+        const uint64x2_t halves = { a.l0, a.l1 };
+        return halves;
+    }
+
+    // bitwise OR of all 128 bits, computed on the two 64-bit lanes
+    goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t merged;
+        merged.l0 = a.l0 | b.l0;
+        merged.l1 = a.l1 | b.l1;
+        return merged;
+    }
+
+    // bitwise AND of all 128 bits, computed on the two 64-bit lanes
+    goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t common;
+        common.l0 = a.l0 & b.l0;
+        common.l1 = a.l1 & b.l1;
+        return common;
+    }
+
+    // (~a) & b over all 128 bits: the FIRST argument is the inverted one
+    goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t masked;
+        masked.l0 = (~a.l0) & b.l0;
+        masked.l1 = (~a.l1) & b.l1;
+        return masked;
+    }
+
+    // Packs the top bit of every byte into a mask: bit i of the result is bit 7 of byte i.
+    goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v)
+    {
+        uint32_t mask = 0;
+        // walk from the last byte down so that byte 0 ends up in bit 0
+        for (uint32_t lane = 16; lane-- > 0; )
+        {
+            mask = (mask << 1) | (uint32_t)(v.data[lane] >> 7);
+        }
+        return mask;
+    }
+
+    //
+    // Bitwise blend, for every bit:
+    //   mask bit set   -> take the bit from a
+    //   mask bit clear -> take the bit from b
+    // (same as _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)))
+    //
+    goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t picked;
+        picked.l0 = (mask.l0 & a.l0) | ((~mask.l0) & b.l0);
+        picked.l1 = (mask.l1 & a.l1) | ((~mask.l1) & b.l1);
+        return picked;
+    }
+
+    // Per-byte unsigned minimum.
+    goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        // start from b and overwrite the bytes where a is strictly smaller
+        uint8x16_t lowest = b;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            if (a.data[lane] < b.data[lane])
+            {
+                lowest.data[lane] = a.data[lane];
+            }
+        }
+        return lowest;
+    }
+
+    // Per-byte unsigned maximum.
+    goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        // start from b and overwrite the bytes where a is strictly larger
+        uint8x16_t highest = b;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            if (a.data[lane] > b.data[lane])
+            {
+                highest.data[lane] = a.data[lane];
+            }
+        }
+        return highest;
+    }
+
+    // Per-byte unsigned average, rounded up: (a + b + 1) / 2.
+    goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t mean;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            // widened so that the sum (at most 511) cannot wrap
+            const uint32_t roundedSum = (uint32_t)a.data[lane] + (uint32_t)b.data[lane] + 1u;
+            mean.data[lane] = (uint8_t)(roundedSum / 2u);
+        }
+        return mean;
+    }
+
+    // Copies 32-bit lane 0 of a into all four lanes.
+    goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a)
+    {
+        const uint32_t lane = a.u0;
+
+        uint8x16_t filled;
+        filled.u0 = lane;
+        filled.u1 = lane;
+        filled.u2 = lane;
+        filled.u3 = lane;
+        return filled;
+    }
+
+    // Copies 32-bit lane 1 of a into all four lanes.
+    goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a)
+    {
+        const uint32_t lane = a.u1;
+
+        uint8x16_t filled;
+        filled.u0 = lane;
+        filled.u1 = lane;
+        filled.u2 = lane;
+        filled.u3 = lane;
+        return filled;
+    }
+
+    // Copies 32-bit lane 2 of a into all four lanes.
+    goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a)
+    {
+        const uint32_t lane = a.u2;
+
+        uint8x16_t filled;
+        filled.u0 = lane;
+        filled.u1 = lane;
+        filled.u2 = lane;
+        filled.u3 = lane;
+        return filled;
+    }
+
+    // Copies 32-bit lane 3 of a into all four lanes.
+    goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a)
+    {
+        const uint32_t lane = a.u3;
+
+        uint8x16_t filled;
+        filled.u0 = lane;
+        filled.u1 = lane;
+        filled.u2 = lane;
+        filled.u3 = lane;
+        return filled;
+    }
+
+    // cmp equal: byte i becomes 0xFF when a[i] == b[i], otherwise 0x00
+    // (equality gives the same answer whether the bytes are read as signed or unsigned)
+    goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t flags;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            const bool same = (a.data[lane] == b.data[lane]);
+            flags.data[lane] = same ? 0xFF : 0x00;
+        }
+        return flags;
+    }
+
+    // cmp less (signed): byte i becomes 0xFF when a[i] < b[i] as signed 8-bit values, otherwise 0x00
+    //
+    // The bytes are reinterpreted as int8_t, not plain char: whether char is signed is
+    // implementation-defined, and with an unsigned char this would silently become an
+    // unsigned compare and disagree with the SSE2 implementation (_mm_cmplt_epi8).
+    goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t res;
+        for (uint32_t i = 0; i < 16; i++)
+        {
+            res.data[i] = ((int8_t)a.data[i] < (int8_t)b.data[i]) ? 0xFF : 0x00;
+        }
+        return res;
+    }
+
+    // add unsigned saturate: per-byte a + b, clamped at 255
+    goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t total;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            const uint32_t sum = (uint32_t)a.data[lane] + (uint32_t)b.data[lane];
+            total.data[lane] = (uint8_t)((sum > 255u) ? 255u : sum);
+        }
+        return total;
+    }
+
+    // sub unsigned saturate: per-byte a - b, clamped at 0
+    goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16_t remainder;
+        for (uint32_t lane = 0; lane < 16; lane++)
+        {
+            const uint8_t lhs = a.data[lane];
+            const uint8_t rhs = b.data[lane];
+            remainder.data[lane] = (lhs > rhs) ? (uint8_t)(lhs - rhs) : (uint8_t)0;
+        }
+        return remainder;
+    }
+
+    // transpose as one 4x4 block of 32-bit (RGBA) elements
+    //
+    // in:
+    //
+    // R0  = | a0 | a1 | a2 | a3 |
+    // R1  = | b0 | b1 | b2 | b3 |
+    // R2  = | c0 | c1 | c2 | c3 |
+    // R3  = | d0 | d1 | d2 | d3 |
+    //
+    // out:
+    //
+    // R0  = | a0 | b0 | c0 | d0 |
+    // R1  = | a1 | b1 | c1 | d1 |
+    // R2  = | a2 | b2 | c2 | d2 |
+    // R3  = | a3 | b3 | c3 | d3 |
+    //
+    goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v)
+    {
+        uint8x16x4_t t;
+
+        // output row k collects 32-bit lane k of every input row
+        t.r0.u0 = v.r0.u0;
+        t.r0.u1 = v.r1.u0;
+        t.r0.u2 = v.r2.u0;
+        t.r0.u3 = v.r3.u0;
+
+        t.r1.u0 = v.r0.u1;
+        t.r1.u1 = v.r1.u1;
+        t.r1.u2 = v.r2.u1;
+        t.r1.u3 = v.r3.u1;
+
+        t.r2.u0 = v.r0.u2;
+        t.r2.u1 = v.r1.u2;
+        t.r2.u2 = v.r2.u2;
+        t.r2.u3 = v.r3.u2;
+
+        t.r3.u0 = v.r0.u3;
+        t.r3.u1 = v.r1.u3;
+        t.r3.u2 = v.r2.u3;
+        t.r3.u3 = v.r3.u3;
+        return t;
+    }
+
+    // deinterleave four rows of four 4-byte pixels into byte planes
+    //
+    // in:
+    //
+    // R0  = | a0.rgba | a1.rgba | a2.rgba | a3.rgba |
+    // R1  = | b0.rgba | b1.rgba | b2.rgba | b3.rgba |
+    // R2  = | c0.rgba | c1.rgba | c2.rgba | c3.rgba |
+    // R3  = | d0.rgba | d1.rgba | d2.rgba | d3.rgba |
+    //
+    // out (the fourth byte of every pixel, alpha, is dropped):
+    //
+    // R0  = | a0.r | a1.r | a2.r | a3.r | b0.r | ... | d3.r |
+    // R1  = | a0.g | a1.g | a2.g | a3.g | b0.g | ... | d3.g |
+    // R2  = | a0.b | a1.b | a2.b | a3.b | b0.b | ... | d3.b |
+    //
+    goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v)
+    {
+        const uint8x16_t* rows[4] = { &v.r0, &v.r1, &v.r2, &v.r3 };
+
+        uint8x16x3_t planes;
+        for (uint32_t row = 0; row < 4; row++)
+        {
+            for (uint32_t px = 0; px < 4; px++)
+            {
+                // bytes 0, 1, 2 of the pixel go to planes 0, 1, 2 at position (row, px)
+                const uint8_t* pixel = &rows[row]->data[4 * px];
+                const uint32_t dst = 4 * row + px;
+                planes.r0.data[dst] = pixel[0];
+                planes.r1.data[dst] = pixel[1];
+                planes.r2.data[dst] = pixel[2];
+            }
+        }
+        return planes;
+    }
+
+    // transpose as four single channel 4x4 blocks at once
+    //
+    // in:
+    //
+    // R0  = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop |
+    // R1  = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop |
+    // R2  = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop |
+    // R3  = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop |
+    //
+    // out:
+    //         NOTE: columns are swapped!
+    //
+    //            2          3           0         1
+    // R0  = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn |
+    // R1  = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn |
+    // R2  = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn |
+    // R3  = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn |
+    //
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | A | B | C | D |           | C | G | K | O |
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | E | F | G | H |           | D | H | L | P |
+    //  +---+---+---+---+    -->    +---+---+---+---+
+    //  | I | J | K | L |           | A | E | I | M |
+    //  +---+---+---+---+           +---+---+---+---+
+    //  | M | N | O | P |           | B | F | J | N |
+    //  +---+---+---+---+           +---+---+---+---+
+    //
+    goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v)
+    {
+        // step 1 (byte-interleave block 0 with block 1, and block 2 with block 3)
+
+        // | 0.a | 1.a | 0.b | 1.b | 0.c | 1.c | 0.d | 1.d | 0.e | 1.e | 0.f | 1.f | 0.g | 1.g | 0.h | 1.h |
+        // | 0.i | 1.i | 0.j | 1.j | 0.k | 1.k | 0.l | 1.l | 0.m | 1.m | 0.n | 1.n | 0.o | 1.o | 0.p | 1.p |
+        const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1);
+        const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1);
+
+        // | 2.a | 3.a | 2.b | 3.b | 2.c | 3.c | 2.d | 3.d | 2.e | 3.e | 2.f | 3.f | 2.g | 3.g | 2.h | 3.h |
+        // | 2.i | 3.i | 2.j | 3.j | 2.k | 3.k | 2.l | 3.l | 2.m | 3.m | 2.n | 3.n | 2.o | 3.o | 2.p | 3.p |
+        const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3);
+        const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3);
+
+        // step 2 (interleave the low and high halves produced by step 1)
+
+        // | 0.a | 0.i | 1.a | 1.i | 0.b | 0.j | 1.b | 1.j | 0.c | 0.k | 1.c | 1.k | 0.d | 0.l | 1.d | 1.l |
+        // | 0.e | 0.m | 1.e | 1.m | 0.f | 0.n | 1.f | 1.n | 0.g | 0.o | 1.g | 1.o | 0.h | 0.p | 1.h | 1.p |
+        const uint8x16_t s1a = detail::unpacklo8(s0a, s0b);
+        const uint8x16_t s1b = detail::unpackhi8(s0a, s0b);
+
+        // | 2.a | 2.i | 3.a | 3.i | 2.b | 2.j | 3.b | 3.j | 2.c | 2.k | 3.c | 3.k | 2.d | 2.l | 3.d | 3.l |
+        // | 2.e | 2.m | 3.e | 3.m | 2.f | 2.n | 3.f | 3.n | 2.g | 2.o | 3.g | 3.o | 2.h | 2.p | 3.h | 3.p |
+        const uint8x16_t s1c = detail::unpacklo8(s0c, s0d);
+        const uint8x16_t s1d = detail::unpackhi8(s0c, s0d);
+
+        // step 3 (third byte interleave: every 32-bit lane now holds one column of one block)
+
+        // | 0.a | 0.e | 0.i | 0.m | 1.a | 1.e | 1.i | 1.m | 0.b | 0.f | 0.j | 0.n | 1.b | 1.f | 1.j | 1.n |
+        // | 0.c | 0.g | 0.k | 0.o | 1.c | 1.g | 1.k | 1.o | 0.d | 0.h | 0.l | 0.p | 1.d | 1.h | 1.l | 1.p |
+        const uint8x16_t s2a = detail::unpacklo8(s1a, s1b);
+        const uint8x16_t s2b = detail::unpackhi8(s1a, s1b);
+
+        // | 2.a | 2.e | 2.i | 2.m | 3.a | 3.e | 3.i | 3.m | 2.b | 2.f | 2.j | 2.n | 3.b | 3.f | 3.j | 3.n |
+        // | 2.c | 2.g | 2.k | 2.o | 3.c | 3.g | 3.k | 3.o | 2.d | 2.h | 2.l | 2.p | 3.d | 3.h | 3.l | 3.p |
+        const uint8x16_t s2c = detail::unpacklo8(s1c, s1d);
+        const uint8x16_t s2d = detail::unpackhi8(s1c, s1d);
+
+        // step 4 (32-bit interleave: bring the columns of the same block next to each other)
+
+        // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o |
+        // | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p |
+        const uint8x16_t s3a = detail::unpacklo32(s2a, s2b);
+        const uint8x16_t s3b = detail::unpackhi32(s2a, s2b);
+
+        // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o |
+        // | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p |
+        const uint8x16_t s3c = detail::unpacklo32(s2c, s2d);
+        const uint8x16_t s3d = detail::unpackhi32(s2c, s2d);
+
+        // step 5 (64-bit interleave: one block per vector, columns in the order 0, 2, 1, 3)
+
+        // | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p |
+        // | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p |
+        const uint8x16_t s4a = detail::unpacklo64(s3a, s3b);
+        const uint8x16_t s4b = detail::unpackhi64(s3a, s3b);
+
+        // | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p |
+        // | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p |
+        const uint8x16_t s4c = detail::unpacklo64(s3c, s3d);
+        const uint8x16_t s4d = detail::unpackhi64(s3c, s3d);
+
+        // step 6 (final: reorder the 32-bit lanes to 1, 3, 0, 2, i.e. columns 2, 3, 0, 1)
+        uint8x16x4_t res;
+        // | 0.c | 0.g | 0.k | 0.o | 0.d | 0.h | 0.l | 0.p | 0.a | 0.e | 0.i | 0.m | 0.b | 0.f | 0.j | 0.n |
+        res.r0 = detail::swizzleU1302(s4a);
+        // | 1.c | 1.g | 1.k | 1.o | 1.d | 1.h | 1.l | 1.p | 1.a | 1.e | 1.i | 1.m | 1.b | 1.f | 1.j | 1.n |
+        res.r1 = detail::swizzleU1302(s4b);
+        // | 2.c | 2.g | 2.k | 2.o | 2.d | 2.h | 2.l | 2.p | 2.a | 2.e | 2.i | 2.m | 2.b | 2.f | 2.j | 2.n |
+        res.r2 = detail::swizzleU1302(s4c);
+        // | 3.c | 3.g | 3.k | 3.o | 3.d | 3.h | 3.l | 3.p | 3.a | 3.e | 3.i | 3.m | 3.b | 3.f | 3.j | 3.n |
+        res.r3 = detail::swizzleU1302(s4d);
+        return res;
+    }
+
+    // two independent 32-bit zips (see zipU4): one of a with b, one of c with d
+    //
+    // in:
+    //
+    // A  = | a0 | a1 | a2 | a3 |
+    // B  = | b0 | b1 | b2 | b3 |
+    // C  = | c0 | c1 | c2 | c3 |
+    // D  = | d0 | d1 | d2 | d3 |
+    //
+    // out:
+    //
+    // R0  = | a0 | b0 | a1 | b1 |
+    // R1  = | a2 | b2 | a3 | b3 |
+    // R2  = | c0 | d0 | c1 | d1 |
+    // R3  = | c2 | d2 | c3 | d3 |
+    //
+    goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d)
+    {
+        uint8x16x4_t zipped;
+        zipped.r0 = detail::unpacklo32(a, b);
+        zipped.r1 = detail::unpackhi32(a, b);
+        zipped.r2 = detail::unpacklo32(c, d);
+        zipped.r3 = detail::unpackhi32(c, d);
+        return zipped;
+    }
+
+    // Interleave the 32-bit (rgba) elements of two vectors.
+    // in:
+    //
+    // a  = | a0.rgba | a1.rgba | a2.rgba | a3.rgba |
+    // b  = | b0.rgba | b1.rgba | b2.rgba | b3.rgba |
+    //
+    // out:
+    //
+    // R0  = | a0.rgba | b0.rgba | a1.rgba | b1.rgba |
+    // R1  = | a2.rgba | b2.rgba | a3.rgba | b3.rgba |
+    //
+    goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16x2_t zipped;
+        zipped.r0 = detail::unpacklo32(a, b); // elements 0 and 1 of each input
+        zipped.r1 = detail::unpackhi32(a, b); // elements 2 and 3 of each input
+        return zipped;
+    }
+
+    // Interleave two vectors byte by byte.
+    // in:
+    //
+    // a  = | a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | aA | aB | aC | aD | aE | aF |
+    // b  = | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 | b8 | b9 | bA | bB | bC | bD | bE | bF |
+    //
+    // out:
+    //
+    // R0  = | a0 | b0 | a1 | b1 | a2 | b2 | a3 | b3 | a4 | b4 | a5 | b5 | a6 | b6 | a7 | b7 |
+    // R1  = | a8 | b8 | a9 | b9 | aA | bA | aB | bB | aC | bC | aD | bD | aE | bE | aF | bF |
+    //
+    goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b)
+    {
+        uint8x16x2_t zipped;
+        zipped.r0 = detail::unpacklo8(a, b); // bytes 0..7 of each input
+        zipped.r1 = detail::unpackhi8(a, b); // bytes 8..F of each input
+        return zipped;
+    }
+
+    goofy_inline uint8x16_t not(const uint8x16_t& v) // NOTE(review): `not` is an ISO C++ alternative token; this name only compiles where those are disabled
+    {
+        uint8x16_t flipped = v; // start from a copy, then complement every byte lane in place
+        for (int lane = 0; lane < 16; ++lane)
+        {
+            flipped.data[lane] ^= 0xFF; // XOR with all ones inverts each bit of the byte
+        }
+        return flipped;
+    }
+
+#endif
+}
+
+// Size checks for the vector wrapper types; each message names the type that is actually being checked.
+static_assert(sizeof(uint8x16_t) == 16, "Incorrect uint8x16_t sizeof");
+static_assert(sizeof(uint8x16x2_t) == 32, "Incorrect uint8x16x2_t sizeof");
+static_assert(sizeof(uint8x16x3_t) == 48, "Incorrect uint8x16x3_t sizeof");
+static_assert(sizeof(uint8x16x4_t) == 64, "Incorrect uint8x16x4_t sizeof");
+static_assert(sizeof(uint64x2_t) == 16, "Incorrect uint64x2_t sizeof");
+
+// Block brightness range (0..255) -> top byte of the first 32-bit word of an ETC1 block; the low 24 bits are zero so the caller can OR in the base color.
+// Eight steps (0x03, 0x27, 0x4B, 0x6F, 0x93, 0xB7, 0xDB, 0xFF): low two bits always set, bits 2-4 and 5-7 hold the same index 0..7 (presumably the ETC1 flag bits and intensity-table index - verify against the spec).
+static const uint32_t etc1BrighnessRangeTocontrolByte[256] = {
+    0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000,
+    0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000,
+    0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000,
+    0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000,
+    0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000,
+    0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000,
+    0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
+    0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
+    0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
+    0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000,
+    0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000,
+    0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
+    0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
+    0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
+    0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
+    0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xFF000000, 0xFF000000
+};
+
+enum GoofyCodecType // output format selector, used as the template argument of goofySimdEncode
+{
+    GOOFY_DXT1 = 0, // emit DXT1 blocks
+    GOOFY_ETC1 = 1, // emit ETC1 blocks
+};
+
+// Encode four horizontally adjacent 4x4 blocks (one 16x4-pixel tile) as DXT1 or ETC1, selected by CODEC_TYPE.
+// inputRGBA points at the tile's top-left pixel, inputStride is the byte distance between pixel rows,
+// pResult receives the four encoded blocks as eight consecutive 32-bit words (32 bytes).
+template<GoofyCodecType CODEC_TYPE>
+goofy_inline void goofySimdEncode(const unsigned char* goofy_restrict inputRGBA, size_t inputStride, unsigned char* goofy_restrict pResult)
+{
+    // Fetch 16x4 pixels from the buffer (four DX blocks); blN.rM = row M of block N
+    // 16 pixels wide is better for the CPU cache utilization (64 bytes per line) and it is better for SIMD lane utilization
+    // -----------------------------------------------------------
+    uint8x16x4_t bl0;
+    uint8x16x4_t bl1;
+    uint8x16x4_t bl2;
+    uint8x16x4_t bl3;
+    bl0.r0 = simd::fetch(inputRGBA);
+    bl1.r0 = simd::fetch(inputRGBA + 16);
+    bl2.r0 = simd::fetch(inputRGBA + 32);
+    bl3.r0 = simd::fetch(inputRGBA + 48);
+    inputRGBA += inputStride;
+    bl0.r1 = simd::fetch(inputRGBA);
+    bl1.r1 = simd::fetch(inputRGBA + 16);
+    bl2.r1 = simd::fetch(inputRGBA + 32);
+    bl3.r1 = simd::fetch(inputRGBA + 48);
+    inputRGBA += inputStride;
+    bl0.r2 = simd::fetch(inputRGBA);
+    bl1.r2 = simd::fetch(inputRGBA + 16);
+    bl2.r2 = simd::fetch(inputRGBA + 32);
+    bl3.r2 = simd::fetch(inputRGBA + 48);
+    inputRGBA += inputStride;
+    bl0.r3 = simd::fetch(inputRGBA);
+    bl1.r3 = simd::fetch(inputRGBA + 16);
+    bl2.r3 = simd::fetch(inputRGBA + 32);
+    bl3.r3 = simd::fetch(inputRGBA + 48);
+
+    // Find min block colors (first reduce the four rows of every block to per-column minima)
+    // -----------------------------------------------------------
+    const uint8x16x4_t blMin = {
+        simd::minu(simd::minu(bl0.r0, bl0.r1), simd::minu(bl0.r2, bl0.r3)), // min0_clmn0.rgba | min0_clmn1.rgba | min0_clmn2.rgba | min0_clmn3.rgba
+        simd::minu(simd::minu(bl1.r0, bl1.r1), simd::minu(bl1.r2, bl1.r3)), // min1_clmn0.rgba | min1_clmn1.rgba | min1_clmn2.rgba | min1_clmn3.rgba
+        simd::minu(simd::minu(bl2.r0, bl2.r1), simd::minu(bl2.r2, bl2.r3)), // min2_clmn0.rgba | min2_clmn1.rgba | min2_clmn2.rgba | min2_clmn3.rgba
+        simd::minu(simd::minu(bl3.r0, bl3.r1), simd::minu(bl3.r2, bl3.r3))  // min3_clmn0.rgba | min3_clmn1.rgba | min3_clmn2.rgba | min3_clmn3.rgba
+    };
+
+    // blMinTr (transposed blMin)
+    // min0_clmn0.rgba | min1_clmn0.rgba | min2_clmn0.rgba | min3_clmn0.rgba
+    // min0_clmn1.rgba | min1_clmn1.rgba | min2_clmn1.rgba | min3_clmn1.rgba
+    // min0_clmn2.rgba | min1_clmn2.rgba | min2_clmn2.rgba | min3_clmn2.rgba
+    // min0_clmn3.rgba | min1_clmn3.rgba | min2_clmn3.rgba | min3_clmn3.rgba
+    const uint8x16x4_t blMinTr = simd::transposeAs4x4(blMin);
+
+    // Per-block min colors (reduce the four column minima of every block)
+    // min0.rgba | min1.rgba | min2.rgba | min3.rgba
+    const uint8x16_t minColors = simd::minu(
+        simd::minu(blMinTr.r0, blMinTr.r1),
+        simd::minu(blMinTr.r2, blMinTr.r3)
+    );
+
+    // Same to find max block colors
+    // -----------------------------------------------------------
+    const uint8x16x4_t blMax = {
+        simd::maxu(simd::maxu(bl0.r0, bl0.r1), simd::maxu(bl0.r2, bl0.r3)),
+        simd::maxu(simd::maxu(bl1.r0, bl1.r1), simd::maxu(bl1.r2, bl1.r3)),
+        simd::maxu(simd::maxu(bl2.r0, bl2.r1), simd::maxu(bl2.r2, bl2.r3)),
+        simd::maxu(simd::maxu(bl3.r0, bl3.r1), simd::maxu(bl3.r2, bl3.r3))
+    };
+
+    const uint8x16x4_t blMaxTr = simd::transposeAs4x4(blMax);
+
+    // Per-block max colors
+    // max0.rgba | max1.rgba | max2.rgba | max3.rgba
+    const uint8x16_t maxColors = simd::maxu(
+        simd::maxu(blMaxTr.r0, blMaxTr.r1),
+        simd::maxu(blMaxTr.r2, blMaxTr.r3)
+    );
+
+    // Find min/max brightness
+    // -----------------------------------------------------------
+
+    // Note: some SSE lanes wasted, it is not ideal, but seems OK-ish?
+    //
+    // min0.rgba | min0.rgba | min1.rgba | min1.rgba
+    // min2.rgba | min2.rgba | min3.rgba | min3.rgba
+    // max0.rgba | max0.rgba | max1.rgba | max1.rgba
+    // max2.rgba | max2.rgba | max3.rgba | max3.rgba
+    const uint8x16x4_t blMinMax = simd::zipU4x2(minColors, minColors, maxColors, maxColors);
+
+    // Deinterleave
+    // min0.rr | min1.rr | min2.rr | min3.rr | max0.rr | max1.rr | max2.rr | max3.rr
+    // min0.gg | min1.gg | min2.gg | min3.gg | max0.gg | max1.gg | max2.gg | max3.gg
+    // min0.bb | min1.bb | min2.bb | min3.bb | max0.bb | max1.bb | max2.bb | max3.bb
+    const uint8x16x3_t blMinMaxDi = simd::deinterleaveRGB(blMinMax);
+
+    // Get Y component of YCoCg color-model (perceptual brightness)
+    // https://en.wikipedia.org/wiki/YCoCg
+    // Y = 0.25 * R + 0.25 * B + 0.5 * G
+    //   We can rewrite equation above using the following form
+    // Y = (((R + B) / 2) + G) / 2
+    //
+    // Y = min0.yy | min1.yy | min2.yy | min3.yy | max0.yy | max1.yy | max2.yy | max3.yy
+    const uint8x16_t Y = simd::avg(simd::avg(blMinMaxDi.r0, blMinMaxDi.r2), blMinMaxDi.r1);
+
+    // Min/max brightness per block
+    // R0 = min0.yyyy | min1.yyyy | min2.yyyy | min3.yyyy
+    // R1 = max0.yyyy | max1.yyyy | max2.yyyy | max3.yyyy
+    const uint8x16x2_t blMinMaxY = simd::zipB16(Y, Y);
+
+    // Brightness range per block (max - min), clamped from below to 8
+    const uint8x16_t constEight = simd::fetch(&gConstEight);
+    // range0.yyyy | range1.yyyy | range2.yyyy | range3.yyyy
+    const uint8x16_t blRangeY = simd::maxu(simd::subsatu(blMinMaxY.r1, blMinMaxY.r0), constEight);
+
+    // mid0.yyyy | mid1.yyyy | mid2.yyyy | mid3.yyyy
+    const uint8x16_t blMidY = simd::avg(blMinMaxY.r0, blMinMaxY.r1);
+
+    // Approximate multiplication by 0.375 to get quantization thresholds (avg with zero halves the value)
+    const uint8x16_t constZero = simd::zero();
+
+    const uint8x16_t blHalfRangeY = simd::avg(blRangeY, constZero);
+    const uint8x16_t blQuarterRangeY = simd::avg(blHalfRangeY, constZero);
+    const uint8x16_t blEighthsRangeY = simd::avg(blQuarterRangeY, constZero);
+
+    // Threshold = (quarter + eighth) = (0.25 + 0.125) ~= (range * 0.375)
+    // qt0.yyyy | qt1.yyyy | qt2.yyyy | qt3.yyyy
+    const uint8x16_t blQThreshold = simd::addsatu(blQuarterRangeY, blEighthsRangeY);
+
+    // Quantization (generate indices)
+    // -----------------------------------------------------------
+    const uint8x16_t constMaxInt = simd::fetch(&gConstMaxInt);
+
+    //  block 0
+    //
+    // p0.r p1.r p2.r p3.r p4.r p5.r p6.r p7.r p8.r p9.r p10.r p11.r p12.r p13.r p14.r p15.r
+    // p0.g p1.g p2.g p3.g p4.g p5.g p6.g p7.g p8.g p9.g p10.g p11.g p12.g p13.g p14.g p15.g
+    // p0.b p1.b p2.b p3.b p4.b p5.b p6.b p7.b p8.b p9.b p10.b p11.b p12.b p13.b p14.b p15.b
+    const uint8x16x3_t bl0Di = simd::deinterleaveRGB(bl0);
+
+    // Convert RGB to brightness
+    // per-pixel block brightness
+    const uint8x16_t bl0Y = simd::avg(simd::avg(bl0Di.r0, bl0Di.r2), bl0Di.r1);
+
+    // Block brightness to compare with
+    const uint8x16_t bl0MidY = simd::replicateU0000(blMidY);
+
+    // Brightness difference (per-pixel in block)
+    // NOTE: we need to clamp difference to max signed int8, because of the signed comparison later
+    const uint8x16_t bl0PosDiffY = simd::minu(simd::subsatu(bl0Y, bl0MidY), constMaxInt);
+    const uint8x16_t bl0NegDiffY = simd::minu(simd::subsatu(bl0MidY, bl0Y), constMaxInt);
+    // Greater or Equal to zero mask
+    const uint8x16_t bl0GezMask = simd::cmpeqi(bl0NegDiffY, constZero);
+
+    // Absolute difference of brightness (per-pixel in block); assuming subsatu saturates at zero, at most one operand is non-zero
+    const uint8x16_t bl0AbsDiffY = simd::or(bl0PosDiffY, bl0NegDiffY);
+
+    // get quantization threshold for current block
+    const uint8x16_t bl0QThreshold = simd::replicateU0000(blQThreshold);
+
+    // Less than Quantization Threshold mask
+    const uint8x16_t bl0LqtMask = simd::cmplti(bl0AbsDiffY, bl0QThreshold);
+
+    // Here we've got two bitmasks
+    //
+    // GezMask = greater or equal than zero (per pixel)
+    // LqtMask = less than quantization threshold (per pixel)
+    //
+    //
+    //  min       qt          qt        max
+    //   x---------x-----+-----x---------x
+    //                   0
+    //
+    //                   |---------------| greater or equal than zero (GezMask)
+    //
+    //             |-----------| less than quantization threshold (LqtMask)
+    //
+
+    //  block 1 (same steps as block 0, using lane group 1 of blMidY / blQThreshold)
+    const uint8x16x3_t bl1Di = simd::deinterleaveRGB(bl1);
+    const uint8x16_t bl1Y = simd::avg(simd::avg(bl1Di.r0, bl1Di.r2), bl1Di.r1);
+    const uint8x16_t bl1MidY = simd::replicateU1111(blMidY);
+    const uint8x16_t bl1PosDiffY = simd::minu(simd::subsatu(bl1Y, bl1MidY), constMaxInt);
+    const uint8x16_t bl1NegDiffY = simd::minu(simd::subsatu(bl1MidY, bl1Y), constMaxInt);
+    const uint8x16_t bl1GezMask = simd::cmpeqi(bl1NegDiffY, constZero);
+    const uint8x16_t bl1AbsDiffY = simd::or(bl1PosDiffY, bl1NegDiffY);
+    const uint8x16_t bl1QThreshold = simd::replicateU1111(blQThreshold);
+    const uint8x16_t bl1LqtMask = simd::cmplti(bl1AbsDiffY, bl1QThreshold);
+
+    //  block 2 (same steps as block 0, using lane group 2)
+    const uint8x16x3_t bl2Di = simd::deinterleaveRGB(bl2);
+    const uint8x16_t bl2Y = simd::avg(simd::avg(bl2Di.r0, bl2Di.r2), bl2Di.r1);
+    const uint8x16_t bl2MidY = simd::replicateU2222(blMidY);
+    const uint8x16_t bl2PosDiffY = simd::minu(simd::subsatu(bl2Y, bl2MidY), constMaxInt);
+    const uint8x16_t bl2NegDiffY = simd::minu(simd::subsatu(bl2MidY, bl2Y), constMaxInt);
+    const uint8x16_t bl2GezMask = simd::cmpeqi(bl2NegDiffY, constZero);
+    const uint8x16_t bl2AbsDiffY = simd::or(bl2PosDiffY, bl2NegDiffY);
+    const uint8x16_t bl2QThreshold = simd::replicateU2222(blQThreshold);
+    const uint8x16_t bl2LqtMask = simd::cmplti(bl2AbsDiffY, bl2QThreshold);
+
+    //  block 3 (same steps as block 0, using lane group 3)
+    const uint8x16x3_t bl3Di = simd::deinterleaveRGB(bl3);
+    const uint8x16_t bl3Y = simd::avg(simd::avg(bl3Di.r0, bl3Di.r2), bl3Di.r1);
+    const uint8x16_t bl3MidY = simd::replicateU3333(blMidY);
+    const uint8x16_t bl3PosDiffY = simd::minu(simd::subsatu(bl3Y, bl3MidY), constMaxInt);
+    const uint8x16_t bl3NegDiffY = simd::minu(simd::subsatu(bl3MidY, bl3Y), constMaxInt);
+    const uint8x16_t bl3GezMask = simd::cmpeqi(bl3NegDiffY, constZero);
+    const uint8x16_t bl3AbsDiffY = simd::or(bl3PosDiffY, bl3NegDiffY);
+    const uint8x16_t bl3QThreshold = simd::replicateU3333(blQThreshold);
+    const uint8x16_t bl3LqtMask = simd::cmplti(bl3AbsDiffY, bl3QThreshold);
+
+    // Finalize blocks (CODEC_TYPE is a template argument, so only one of the two branches below is live per instantiation)
+    // -----------------------------------------------------------
+    if (CODEC_TYPE == GOOFY_DXT1)
+    {
+        // Generate DXT indices using given masks
+
+        // DXT indices order
+        // -------------------------
+        //        C0(max)     C2        C3      C1(min)
+        // DEC: |    0    |    2    |    3    |    1    |
+        // BIN: |   00b   |   10b   |   11b   |   01b   |
+        //
+        //      |      GezMask      |
+        //                          |      LqtMask      |
+
+        // Zip two masks to match DX bits order
+        // Gez0 | Lqt0 | Gez1 | Lqt1 | Gez2 | Lqt2 | Gez3 | Lqt3 | Gez4 | Lqt4 | Gez5 | Lqt5 | Gez6 | Lqt6 | Gez7 | Lqt7
+        // Gez8 | Lqt8 | Gez9 | Lqt9 | GezA | LqtA | GezB | LqtB | GezC | LqtC | GezD | LqtD | GezE | LqtE | GezF | LqtF
+        const uint8x16x2_t bl0RawIndices = simd::zipB16(simd::not(bl0GezMask), bl0LqtMask);
+        const uint8x16x2_t bl3RawIndices = simd::zipB16(simd::not(bl3GezMask), bl3LqtMask);
+        const uint8x16x2_t bl2RawIndices = simd::zipB16(simd::not(bl2GezMask), bl2LqtMask);
+        const uint8x16x2_t bl1RawIndices = simd::zipB16(simd::not(bl1GezMask), bl1LqtMask);
+
+        // Bytes to bits (NOTE(review): confirm moveMaskMSB returns an unsigned type; a signed int shifted << 16 could overflow)
+        uint32_t bl0Indices = simd::moveMaskMSB(bl0RawIndices.r0) | (simd::moveMaskMSB(bl0RawIndices.r1) << 16);
+        uint32_t bl1Indices = simd::moveMaskMSB(bl1RawIndices.r0) | (simd::moveMaskMSB(bl1RawIndices.r1) << 16);
+        uint32_t bl2Indices = simd::moveMaskMSB(bl2RawIndices.r0) | (simd::moveMaskMSB(bl2RawIndices.r1) << 16);
+        uint32_t bl3Indices = simd::moveMaskMSB(bl3RawIndices.r0) | (simd::moveMaskMSB(bl3RawIndices.r1) << 16);
+
+        // Convert rgb888 to rgb555
+
+        // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead
+        // We need to sub eight before, because avg is (a+b+1) >> 1
+
+        // max555_0.rgba | max555_1.rgba | max555_2.rgba | max555_3.rgba
+        const uint8x16_t maxColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(maxColors, constEight), constZero), constZero), constZero);
+        // min555_0.rgba | min555_1.rgba | min555_2.rgba | min555_3.rgba
+        const uint8x16_t minColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(minColors, constEight), constZero), constZero), constZero);
+
+        // max555_0.rgba | min555_0.rgba | max555_1.rgba | min555_1.rgba
+        // max555_2.rgba | min555_2.rgba | max555_3.rgba | min555_3.rgba
+        const uint8x16x2_t maxMinColors555 = simd::zipU4(maxColors555, minColors555);
+
+        const uint64x2_t maxMin01 = simd::getAsUInt64x2(maxMinColors555.r0);
+        const uint64x2_t maxMin23 = simd::getAsUInt64x2(maxMinColors555.r1);
+
+        // R0
+        // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 11 = 0000000000000000 1111100000000000b
+        // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b >> 2  = 0000000000000000 0000011111000000b
+        // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b >> 16 = 0000000000000000 0000000000011111b
+
+        // R1
+        // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 5 =  1111100000000000 0000000000000000b
+        // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 18 = 0000011111000000 0000000000000000b
+        // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 32 = 0000000000011111 0000000000000000b
+
+        // 0x20                                                                    = 0000000000000000 0000000000100000b
+
+        uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult;
+
+        uint32_t block0a = (uint32_t)(0x20 | // max color green channel LSB (to avoid switching to DXT1 3-color mode)
+            (maxMin01.r0 & 0x1Full) << 11ull | (maxMin01.r0 & 0x1F00ull) >> 2ull | (maxMin01.r0 & 0x1F0000ull) >> 16ull |  // max color
+            (maxMin01.r0 & 0x1F00000000ull) >> 5ull | (maxMin01.r0 & 0x1F0000000000ull) >> 18ull | (maxMin01.r0 & 0x1F000000000000ull) >> 32ull); // min color
+        // block 0: first word = packed max/min colors, second word = 2-bit-per-pixel indices
+        *pDest = block0a; pDest++; *pDest = bl0Indices;
+
+        uint32_t block1a = (uint32_t)(0x20 |
+            (maxMin01.r1 & 0x1Full) << 11ull | (maxMin01.r1 & 0x1F00ull) >> 2ull | (maxMin01.r1 & 0x1F0000ull) >> 16ull |
+            (maxMin01.r1 & 0x1F00000000ull) >> 5ull | (maxMin01.r1 & 0x1F0000000000ull) >> 18ull | (maxMin01.r1 & 0x1F000000000000ull) >> 32ull);
+        // block 1: colors word followed by indices word
+        pDest++; *pDest = block1a; pDest++; *pDest = bl1Indices;
+
+        uint32_t block2a = (uint32_t)(0x20 |
+            (maxMin23.r0 & 0x1Full) << 11ull | (maxMin23.r0 & 0x1F00ull) >> 2ull | (maxMin23.r0 & 0x1F0000ull) >> 16ull |
+            (maxMin23.r0 & 0x1F00000000ull) >> 5ull | (maxMin23.r0 & 0x1F0000000000ull) >> 18ull | (maxMin23.r0 & 0x1F000000000000ull) >> 32ull);
+        // block 2: colors word followed by indices word
+        pDest++; *pDest = block2a; pDest++; *pDest = bl2Indices;
+
+        uint32_t block3a = (uint32_t)(0x20 |
+            (maxMin23.r1 & 0x1Full) << 11ull | (maxMin23.r1 & 0x1F00ull) >> 2ull | (maxMin23.r1 & 0x1F0000ull) >> 16ull |
+            (maxMin23.r1 & 0x1F00000000ull) >> 5ull | (maxMin23.r1 & 0x1F0000000000ull) >> 18ull | (maxMin23.r1 & 0x1F000000000000ull) >> 32ull);
+        // block 3: colors word followed by indices word
+        pDest++; *pDest = block3a; pDest++; *pDest = bl3Indices;
+    }
+    else if (CODEC_TYPE == GOOFY_ETC1)
+    {
+        // Combined masks (major bit = GreaterEqualZero  other 7 bits = LessQuantizationThreshold)
+        const uint8x16x4_t blMasks = {
+            simd::or(simd::andnot(constMaxInt, bl0GezMask), simd::and(bl0LqtMask, constMaxInt)),
+            simd::or(simd::andnot(constMaxInt, bl1GezMask), simd::and(bl1LqtMask, constMaxInt)),
+            simd::or(simd::andnot(constMaxInt, bl2GezMask), simd::and(bl2LqtMask, constMaxInt)),
+            simd::or(simd::andnot(constMaxInt, bl3GezMask), simd::and(bl3LqtMask, constMaxInt))
+        };
+
+        //  +---+---+---+---+           +---+---+---+---+
+        //  | A | B | C | D |           | C | G | K | O |
+        //  +---+---+---+---+           +---+---+---+---+
+        //  | E | F | G | H |           | D | H | L | P |
+        //  +---+---+---+---+    -->    +---+---+---+---+
+        //  | I | J | K | L |           | A | E | I | M |
+        //  +---+---+---+---+           +---+---+---+---+
+        //  | M | N | O | P |           | B | F | J | N |
+        //  +---+---+---+---+           +---+---+---+---+
+        const uint8x16x4_t blMasksTr = simd::transposeAs4x4x4(blMasks);
+
+        // Unpack masks and copy from bytes to bits
+        const uint32_t bl0PosOrZero = simd::moveMaskMSB(blMasksTr.r0);
+        const uint32_t bl1PosOrZero = simd::moveMaskMSB(blMasksTr.r1);
+        const uint32_t bl2PosOrZero = simd::moveMaskMSB(blMasksTr.r2);
+        const uint32_t bl3PosOrZero = simd::moveMaskMSB(blMasksTr.r3);
+
+        uint8x16_t bl0LessThanQtMask = simd::and(blMasksTr.r0, constMaxInt);
+        uint8x16_t bl1LessThanQtMask = simd::and(blMasksTr.r1, constMaxInt);
+        uint8x16_t bl2LessThanQtMask = simd::and(blMasksTr.r2, constMaxInt);
+        uint8x16_t bl3LessThanQtMask = simd::and(blMasksTr.r3, constMaxInt);
+        bl0LessThanQtMask = simd::addsatu(bl0LessThanQtMask, bl0LessThanQtMask); // doubling moves bit 6 up into bit 7 (0x7F + 0x7F = 0xFE)
+        bl1LessThanQtMask = simd::addsatu(bl1LessThanQtMask, bl1LessThanQtMask);
+        bl2LessThanQtMask = simd::addsatu(bl2LessThanQtMask, bl2LessThanQtMask);
+        bl3LessThanQtMask = simd::addsatu(bl3LessThanQtMask, bl3LessThanQtMask);
+
+        const uint32_t bl0LessThanQt = simd::moveMaskMSB(bl0LessThanQtMask);
+        const uint32_t bl1LessThanQt = simd::moveMaskMSB(bl1LessThanQtMask);
+        const uint32_t bl2LessThanQt = simd::moveMaskMSB(bl2LessThanQtMask);
+        const uint32_t bl3LessThanQt = simd::moveMaskMSB(bl3LessThanQtMask);
+
+#if 1
+        // Keep chromatic component from the average color, but override brightness
+        // NOTE: This is slightly slower but gets slightly better quality
+
+        // Find average blocks color
+        const uint8x16x4_t blAvg = {
+            simd::avg(simd::avg(bl0.r0, bl0.r1), simd::avg(bl0.r2, bl0.r3)),
+            simd::avg(simd::avg(bl1.r0, bl1.r1), simd::avg(bl1.r2, bl1.r3)),
+            simd::avg(simd::avg(bl2.r0, bl2.r1), simd::avg(bl2.r2, bl2.r3)),
+            simd::avg(simd::avg(bl3.r0, bl3.r1), simd::avg(bl3.r2, bl3.r3))
+        };
+
+        const uint8x16x4_t blAvgTr = simd::transposeAs4x4(blAvg);
+
+        const uint8x16_t blAvgColors = simd::avg(
+            simd::avg(blAvgTr.r0, blAvgTr.r1),
+            simd::avg(blAvgTr.r2, blAvgTr.r3)
+        );
+
+        // Note: a lot of SSE lanes wasted, it is not ideal, TODO?
+        //
+        // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba
+        // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba
+        // avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba
+        // avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba
+        const uint8x16x4_t blAvg4 = simd::zipU4x2(blAvgColors, blAvgColors, blAvgColors, blAvgColors);
+
+        // Deinterleave
+        // avg0.rr | avg1.rr | avg2.rr | avg3.rr | avg0.rr | avg1.rr | avg2.rr | avg3.rr
+        // avg0.gg | avg1.gg | avg2.gg | avg3.gg | avg0.gg | avg1.gg | avg2.gg | avg3.gg
+        // avg0.bb | avg1.bb | avg2.bb | avg3.bb | avg0.bb | avg1.bb | avg2.bb | avg3.bb
+        const uint8x16x3_t blAvg4Di = simd::deinterleaveRGB(blAvg4);
+
+        // Y = avg0.yy | avg1.yy | avg2.yy | avg3.yy | avg0.yy | avg1.yy | avg2.yy | avg3.yy   (shadows the outer min/max Y)
+        const uint8x16_t Y = simd::avg(simd::avg(blAvg4Di.r0, blAvg4Di.r2), blAvg4Di.r1);
+
+        // Brightness of the average color per block
+        // R0 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy
+        // R1 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy    // NOTE: not used!
+        const uint8x16x2_t blAvgY = simd::zipB16(Y, Y);
+
+        const uint8x16_t blPosCorrectionY = simd::minu(simd::subsatu(blMidY, blAvgY.r0), constMaxInt);
+        const uint8x16_t blNegCorrectionY = simd::minu(simd::subsatu(blAvgY.r0, blMidY), constMaxInt);
+        const uint8x16_t blCorrectionYGezMask = simd::cmpeqi(blNegCorrectionY, constZero);
+        const uint8x16_t blCorrectionYAbs = simd::or(blPosCorrectionY, blNegCorrectionY);
+
+        // Offset every channel of the average color by the brightness correction: one candidate is brightened (addsatu),
+        // the other darkened (subsatu); blCorrectionYGezMask is then used to choose between the two candidates.
+
+        const uint8x16_t blBaseColorsPos = simd::addsatu(blAvgColors, blCorrectionYAbs);
+        const uint8x16_t blBaseColorsNeg = simd::subsatu(blAvgColors, blCorrectionYAbs);
+
+        const uint8x16_t blBaseColors = simd::select(blCorrectionYGezMask, blBaseColorsPos, blBaseColorsNeg);
+#else
+        // Get the color in the middle between  min/max colors of the block.
+        // NOTE: this is not the same as an average block color.
+        const uint8x16_t blBaseColors = simd::avg(minColors, maxColors);
+#endif
+
+        // Convert rgb888 to rgb555
+
+        // We can't shift right by 3 using SIMD, but we can shift right by 1 three times instead
+        // We need to sub eight before, because avg is (a+b+1) >> 1
+
+        // mid555_0.rgba | mid555_1.rgba | mid555_2.rgba | mid555_3.rgba
+        const uint8x16_t baseColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(blBaseColors, constEight), constZero), constZero), constZero);
+
+        const uint64x2_t baseColors = simd::getAsUInt64x2(baseColors555);
+
+        // R0
+        // AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 3  = 00000000 00000000 11111000b
+        // AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b << 3  = 00000000 11111000 00000000b
+        // AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b << 3  = 11111000 00000000 00000000b
+
+        // R1
+        // AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 29 = 00000000 00000000 11111000b
+        // AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 29 = 00000000 11111000 00000000b
+        // AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 29 = 11111000 00000000 00000000b
+
+        uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult;
+        // Per block: word A = control byte (looked up from range bytes 0/4/8/12 of blRangeY) | base color, word B = inverted selector bits
+        const uint32_t block0a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[0]] | ((baseColors.r0 << 3ull) & 0xFFFFFF);
+        const uint32_t block0b = ~(bl0PosOrZero | (bl0LessThanQt << 16));
+        *pDest = block0a; pDest++; *pDest = block0b;
+
+        const uint32_t block1a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[4]] | ((baseColors.r0 >> 29ull) & 0xFFFFFF);
+        const uint32_t block1b = ~(bl1PosOrZero | (bl1LessThanQt << 16));
+        pDest++; *pDest = block1a; pDest++; *pDest = block1b;
+
+        const uint32_t block2a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[8]] | ((baseColors.r1 << 3ull) & 0xFFFFFF);
+        const uint32_t block2b = ~(bl2PosOrZero | (bl2LessThanQt << 16));
+        pDest++; *pDest = block2a; pDest++; *pDest = block2b;
+
+        const uint32_t block3a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[12]] | ((baseColors.r1 >> 29ull) & 0xFFFFFF);
+        const uint32_t block3b = ~(bl3PosOrZero | (bl3LessThanQt << 16));
+        pDest++; *pDest = block3a; pDest++; *pDest = block3b;
+    }
+}
+
+
+// Shared driver behind compressDXT1() and compressETC1(): walks the image in 16x4-pixel tiles (four
+// horizontally adjacent 4x4 blocks) and encodes each tile with goofySimdEncode<CODEC_TYPE>.
+//
+//   result - destination buffer; every tile appends 32 bytes (4 blocks * 8 bytes), tiles are emitted
+//            left-to-right, top-to-bottom, so (width / 4) * (height / 4) * 8 bytes are written in total
+//   input  - first byte of the top-left pixel; each tile consumes 64 bytes (16 rgba pixels) per pixel row
+//   width  - image width in pixels, must be a multiple of 16
+//   height - image height in pixels, must be a multiple of 4
+//   stride - distance in bytes between the starts of two consecutive pixel rows
+//
+// Returns 0 on success, -1 if width is not a multiple of 16, -2 if height is not a multiple of 4.
+// `result` is left untouched when an error code is returned.
+// NOTE(review): stride is not validated against width * 4 and neither pointer is checked for null;
+// both remain the caller's responsibility, exactly as before this helper was factored out.
+template<GoofyCodecType CODEC_TYPE>
+goofy_inline int goofyCompressImage(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride)
+{
+    // the encoder works on a fixed 16x4-pixel window, so partial tiles are rejected up front
+    if (width % 16 != 0)
+    {
+        return -1;
+    }
+
+    if (height % 4 != 0)
+    {
+        return -2;
+    }
+
+    const unsigned int blockW = width >> 2;  // 4x4 blocks per image row
+    const unsigned int blockH = height >> 2; // rows of 4x4 blocks
+    const size_t inputStride = stride;       // widened so the row arithmetic below is done in size_t
+
+    for (uint32_t y = 0; y < blockH; y++)
+    {
+        const unsigned char* goofy_restrict encoderPos = input;
+        for (uint32_t x = 0; x < blockW; x += 4)
+        {
+            goofySimdEncode<CODEC_TYPE>(encoderPos, inputStride, result);
+            encoderPos += 64; // 16 rgba pixels (4 blocks) = 16 * 4 = 64 bytes
+            result += 32;     // 4 encoded blocks = 4 * 8 = 32 bytes
+        }
+        input += inputStride * 4; // next row of blocks = 4 pixel lines down
+    }
+    return 0;
+}
+
+// Compress an RGBA image to DXT1 blocks.
+// Parameters and return codes (0 = ok, -1 = width % 16 != 0, -2 = height % 4 != 0) are those of
+// goofyCompressImage() directly above.
+int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride)
+{
+    return goofyCompressImage<GOOFY_DXT1>(result, input, width, height, stride);
+}
+
+// Compress an RGBA image to ETC1 blocks.
+// Parameters and return codes (0 = ok, -1 = width % 16 != 0, -2 = height % 4 != 0) are those of
+// goofyCompressImage() directly above.
+int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride)
+{
+    return goofyCompressImage<GOOFY_ETC1>(result, input, width, height, stride);
+}
+
+
+
+#undef goofy_restrict
+#undef goofy_inline
+#undef goofy_align16
+}
+#endif
+
+
+
+
+
+// Copyright (c) 2020 Sergey Makeev <sergeymakeev@hotmail.com>
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.