// goofy_tc.h v1.0
// Realtime BC1/ETC1 encoder by Sergey Makeev <sergeymakeev@hotmail.com>
//
// LICENSE:
// MIT license at the end of this file.
namespace goofy
{
int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride);
int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride);
}
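//
// Both encoders take 32-bit RGBA input (stride is in bytes) and write 8 bytes per 4x4 block.
// They return 0 on success, -1 if width is not a multiple of 16, -2 if height is not a multiple of 4.
// A minimal usage sketch (buffer names are illustrative):
//
//   #define GOOFYTC_IMPLEMENTATION
//   #include "goofy_tc.h"
//
//   // rgba : width * height * 4 bytes, rows (width * 4) bytes apart
//   // out  : (width / 4) * (height / 4) * 8 bytes
//   int rc = goofy::compressDXT1(out, rgba, width, height, width * 4);
//
// NOTE: the SSE2 path uses aligned loads, so the input pointer and stride should be 16-byte aligned.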
#include <stdint.h>
// Enable SSE2 codec
#define GOOFY_SSE2 (1)
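// NOTE: the keywords below are MSVC-specific; other compilers need equivalent definitions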
#define goofy_restrict __restrict
#define goofy_inline __forceinline
#define goofy_align16(x) __declspec(align(16)) x
#ifdef GOOFY_SSE2
#include <emmintrin.h> // SSE2
#else
#include <string.h> // memset/memcpy
#endif
#ifdef GOOFYTC_IMPLEMENTATION
namespace goofy
{
// constants
goofy_align16(static const uint32_t gConstEight[4]) = { 0x08080808, 0x08080808, 0x08080808, 0x08080808 };
goofy_align16(static const uint32_t gConstSixteen[4]) = { 0x10101010, 0x10101010, 0x10101010, 0x10101010 };
goofy_align16(static const uint32_t gConstMaxInt[4]) = { 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f, 0x7f7f7f7f };
#ifdef GOOFY_SSE2
typedef __m128i uint8x16_t;
#else
struct uint8x16_t
{
union
{
uint8_t data[16];
int8_t m128i_i8[16];
uint8_t m128i_u8[16];
struct
{
uint8_t r0;
uint8_t g0;
uint8_t b0;
uint8_t a0;
uint8_t r1;
uint8_t g1;
uint8_t b1;
uint8_t a1;
uint8_t r2;
uint8_t g2;
uint8_t b2;
uint8_t a2;
uint8_t r3;
uint8_t g3;
uint8_t b3;
uint8_t a3;
};
struct
{
uint16_t s0;
uint16_t s1;
uint16_t s2;
uint16_t s3;
uint16_t s4;
uint16_t s5;
uint16_t s6;
uint16_t s7;
};
struct
{
uint32_t u0;
uint32_t u1;
uint32_t u2;
uint32_t u3;
};
struct
{
uint64_t l0;
uint64_t l1;
};
};
};
#endif
// 2x16xU8
struct uint8x16x2_t
{
// rows
uint8x16_t r0;
uint8x16_t r1;
};
// 3x16xU8
struct uint8x16x3_t
{
// rows
uint8x16_t r0;
uint8x16_t r1;
uint8x16_t r2;
};
// 4x16xU8
struct uint8x16x4_t
{
// rows
uint8x16_t r0;
uint8x16_t r1;
uint8x16_t r2;
uint8x16_t r3;
};
// 2xU64
struct uint64x2_t
{
uint64_t r0;
uint64_t r1;
};
namespace simd
{
// SSE2 implementation
#ifdef GOOFY_SSE2
goofy_inline uint8x16_t zero()
{
return _mm_setzero_si128();
}
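// NOTE: aligned load - in the SSE2 path the input pointer and row stride must be 16-byte aligned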
goofy_inline uint8x16_t fetch(const void* p)
{
return _mm_load_si128((const __m128i*)p);
}
goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a)
{
uint64x2_t res;
res.r0 = _mm_cvtsi128_si64(a);
res.r1 = _mm_cvtsi128_si64(_mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
return res;
}
goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_or_si128(a, b);
}
goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_and_si128(a, b);
}
goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_andnot_si128(a, b);
}
goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}
goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_min_epu8(a, b);
}
goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_max_epu8(a, b);
}
goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_avg_epu8(a, b);
}
goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a)
{
return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0));
}
goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a)
{
return _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 1, 1));
}
goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a)
{
return _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 2, 2, 2));
}
goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a)
{
return _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 3, 3));
}
goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_cmpeq_epi8(a, b);
}
goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_cmplt_epi8(a, b);
}
goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_adds_epu8(a, b);
}
goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b)
{
return _mm_subs_epu8(a, b);
}
goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v)
{
uint8x16_t tr0 = _mm_unpacklo_epi32(v.r0, v.r1);
uint8x16_t tr1 = _mm_unpacklo_epi32(v.r2, v.r3);
uint8x16_t tr2 = _mm_unpackhi_epi32(v.r0, v.r1);
uint8x16_t tr3 = _mm_unpackhi_epi32(v.r2, v.r3);
uint8x16x4_t res;
res.r0 = _mm_unpacklo_epi64(tr0, tr1);
res.r1 = _mm_unpackhi_epi64(tr0, tr1);
res.r2 = _mm_unpacklo_epi64(tr2, tr3);
res.r3 = _mm_unpackhi_epi64(tr2, tr3);
return res;
}
goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v)
{
uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1);
uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3);
uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);
uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);
uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
uint8x16x3_t res;
res.r0 = _mm_unpacklo_epi64(s2a, s2c); // red
res.r1 = _mm_unpackhi_epi64(s2a, s2c); // green
res.r2 = _mm_unpacklo_epi64(s2b, s2d); // blue
//res.r3 = _mm_unpackhi_epi64(s2b, s2d); // alpha
return res;
}
// transpose as four single channel 4x4 blocks at once
//
// in:
//
// R0 = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop |
// R1 = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop |
// R2 = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop |
// R3 = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop |
//
// out:
//
// NOTE: columns are swapped (same layout as the generic version below)!
//
// R0 = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn |
// R1 = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn |
// R2 = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn |
// R3 = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn |
//
// +---+---+---+---+     +---+---+---+---+
// | A | B | C | D |     | C | G | K | O |
// +---+---+---+---+     +---+---+---+---+
// | E | F | G | H |     | D | H | L | P |
// +---+---+---+---+ --> +---+---+---+---+
// | I | J | K | L |     | A | E | I | M |
// +---+---+---+---+     +---+---+---+---+
// | M | N | O | P |     | B | F | J | N |
// +---+---+---+---+     +---+---+---+---+
//
goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v)
{
const uint8x16_t s0a = _mm_unpacklo_epi8(v.r0, v.r1);
const uint8x16_t s0b = _mm_unpackhi_epi8(v.r0, v.r1);
const uint8x16_t s0c = _mm_unpacklo_epi8(v.r2, v.r3);
const uint8x16_t s0d = _mm_unpackhi_epi8(v.r2, v.r3);
const uint8x16_t s1a = _mm_unpacklo_epi8(s0a, s0b);
const uint8x16_t s1b = _mm_unpackhi_epi8(s0a, s0b);
const uint8x16_t s1c = _mm_unpacklo_epi8(s0c, s0d);
const uint8x16_t s1d = _mm_unpackhi_epi8(s0c, s0d);
const uint8x16_t s2a = _mm_unpacklo_epi8(s1a, s1b);
const uint8x16_t s2b = _mm_unpackhi_epi8(s1a, s1b);
const uint8x16_t s2c = _mm_unpacklo_epi8(s1c, s1d);
const uint8x16_t s2d = _mm_unpackhi_epi8(s1c, s1d);
const uint8x16_t s3a = _mm_unpacklo_epi32(s2a, s2b);
const uint8x16_t s3b = _mm_unpackhi_epi32(s2a, s2b);
const uint8x16_t s3c = _mm_unpacklo_epi32(s2c, s2d);
const uint8x16_t s3d = _mm_unpackhi_epi32(s2c, s2d);
const uint8x16_t s4a = _mm_unpacklo_epi64(s3a, s3b);
const uint8x16_t s4b = _mm_unpackhi_epi64(s3a, s3b);
const uint8x16_t s4c = _mm_unpacklo_epi64(s3c, s3d);
const uint8x16_t s4d = _mm_unpackhi_epi64(s3c, s3d);
uint8x16x4_t res;
res.r0 = _mm_shuffle_epi32(s4a, _MM_SHUFFLE(2, 0, 3, 1));
res.r1 = _mm_shuffle_epi32(s4b, _MM_SHUFFLE(2, 0, 3, 1));
res.r2 = _mm_shuffle_epi32(s4c, _MM_SHUFFLE(2, 0, 3, 1));
res.r3 = _mm_shuffle_epi32(s4d, _MM_SHUFFLE(2, 0, 3, 1));
return res;
}
goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d)
{
const uint8x16x4_t res = {
_mm_unpacklo_epi32(a, b),
_mm_unpackhi_epi32(a, b),
_mm_unpacklo_epi32(c, d),
_mm_unpackhi_epi32(c, d),
};
return res;
}
goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16x2_t res;
res.r0 = _mm_unpacklo_epi32(a, b);
res.r1 = _mm_unpackhi_epi32(a, b);
return res;
}
goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v)
{
return (uint32_t)_mm_movemask_epi8(v);
}
goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16x2_t res;
res.r0 = _mm_unpacklo_epi8(a, b);
res.r1 = _mm_unpackhi_epi8(a, b);
return res;
}
goofy_inline uint8x16_t not(const uint8x16_t& v)
{
return _mm_xor_si128(v, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
}
#else
// generic CPU implementation
namespace detail
{
goofy_inline uint8x16_t unpacklo16(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.s0 = a.s0;
res.s1 = b.s0;
res.s2 = a.s1;
res.s3 = b.s1;
res.s4 = a.s2;
res.s5 = b.s2;
res.s6 = a.s3;
res.s7 = b.s3;
return res;
}
goofy_inline uint8x16_t unpackhi16(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.s0 = a.s4;
res.s1 = b.s4;
res.s2 = a.s5;
res.s3 = b.s5;
res.s4 = a.s6;
res.s5 = b.s6;
res.s6 = a.s7;
res.s7 = b.s7;
return res;
}
goofy_inline uint8x16_t unpacklo8(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.data[0] = a.data[0];
res.data[1] = b.data[0];
res.data[2] = a.data[1];
res.data[3] = b.data[1];
res.data[4] = a.data[2];
res.data[5] = b.data[2];
res.data[6] = a.data[3];
res.data[7] = b.data[3];
res.data[8] = a.data[4];
res.data[9] = b.data[4];
res.data[10] = a.data[5];
res.data[11] = b.data[5];
res.data[12] = a.data[6];
res.data[13] = b.data[6];
res.data[14] = a.data[7];
res.data[15] = b.data[7];
return res;
}
goofy_inline uint8x16_t unpackhi8(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.data[0] = a.data[8];
res.data[1] = b.data[8];
res.data[2] = a.data[9];
res.data[3] = b.data[9];
res.data[4] = a.data[10];
res.data[5] = b.data[10];
res.data[6] = a.data[11];
res.data[7] = b.data[11];
res.data[8] = a.data[12];
res.data[9] = b.data[12];
res.data[10] = a.data[13];
res.data[11] = b.data[13];
res.data[12] = a.data[14];
res.data[13] = b.data[14];
res.data[14] = a.data[15];
res.data[15] = b.data[15];
return res;
}
goofy_inline uint8x16_t unpacklo64(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.l0 = a.l0;
res.l1 = b.l0;
return res;
}
goofy_inline uint8x16_t unpackhi64(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.l0 = a.l1;
res.l1 = b.l1;
return res;
}
goofy_inline uint8x16_t unpacklo32(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.u0 = a.u0;
res.u1 = b.u0;
res.u2 = a.u1;
res.u3 = b.u1;
return res;
}
goofy_inline uint8x16_t unpackhi32(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
res.u0 = a.u2;
res.u1 = b.u2;
res.u2 = a.u3;
res.u3 = b.u3;
return res;
}
goofy_inline uint8x16_t replicateU0011(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u0;
res.u1 = a.u0;
res.u2 = a.u1;
res.u3 = a.u1;
return res;
}
goofy_inline uint8x16_t replicateU2233(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u2;
res.u1 = a.u2;
res.u2 = a.u3;
res.u3 = a.u3;
return res;
}
goofy_inline uint8x16_t swizzleU1302(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u1;
res.u1 = a.u3;
res.u2 = a.u0;
res.u3 = a.u2;
return res;
}
} //detail
goofy_inline uint8x16_t zero()
{
uint8x16_t r;
memset(&r, 0, sizeof(uint8x16_t));
return r;
}
goofy_inline uint8x16_t fetch(const void* p)
{
uint8x16_t r;
memcpy(&r, p, sizeof(uint8x16_t));
return r;
}
goofy_inline uint64x2_t getAsUInt64x2(const uint8x16_t& a)
{
uint64x2_t res;
res.r0 = a.l0;
res.r1 = a.l1;
return res;
}
// bit or
goofy_inline uint8x16_t or(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = a.data[i] | b.data[i];
}
return res;
}
// bit and
goofy_inline uint8x16_t and(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = a.data[i] & b.data[i];
}
return res;
}
goofy_inline uint8x16_t andnot(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = (~a.data[i]) & b.data[i];
}
return res;
}
goofy_inline uint32_t moveMaskMSB(const uint8x16_t& v)
{
uint32_t res = 0;
for (uint32_t i = 0; i < 16; i++)
{
uint32_t msb = ((v.data[i] & 0x80) >> 7);
res = res | (msb << i);
}
return res;
}
//
// if (mask) {
// return a;
// }
// else {
// return b;
// }
goofy_inline uint8x16_t select(const uint8x16_t& mask, const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
unsigned char msk = mask.data[i];
res.data[i] = (msk & a.data[i]) | ((~msk) & b.data[i]); // _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b))
}
return res;
}
goofy_inline uint8x16_t minu(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = (a.data[i] < b.data[i]) ? a.data[i] : b.data[i];
}
return res;
}
goofy_inline uint8x16_t maxu(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = (a.data[i] > b.data[i]) ? a.data[i] : b.data[i];
}
return res;
}
goofy_inline uint8x16_t avg(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
uint32_t t = (a.data[i] + b.data[i]) + 1;
res.data[i] = (uint8_t)(t >> 1);
}
return res;
}
goofy_inline uint8x16_t replicateU0000(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u0;
res.u1 = a.u0;
res.u2 = a.u0;
res.u3 = a.u0;
return res;
}
goofy_inline uint8x16_t replicateU1111(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u1;
res.u1 = a.u1;
res.u2 = a.u1;
res.u3 = a.u1;
return res;
}
goofy_inline uint8x16_t replicateU2222(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u2;
res.u1 = a.u2;
res.u2 = a.u2;
res.u3 = a.u2;
return res;
}
goofy_inline uint8x16_t replicateU3333(const uint8x16_t& a)
{
uint8x16_t res;
res.u0 = a.u3;
res.u1 = a.u3;
res.u2 = a.u3;
res.u3 = a.u3;
return res;
}
// cmp equal (signed)
goofy_inline uint8x16_t cmpeqi(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = ((char)a.data[i] == (char)b.data[i]) ? 0xFF : 0x00;
}
return res;
}
// cmp less (signed)
goofy_inline uint8x16_t cmplti(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
res.data[i] = ((char)a.data[i] < (char)b.data[i]) ? 0xFF : 0x00;
}
return res;
}
// add unsigned saturate
goofy_inline uint8x16_t addsatu(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
int32_t sum = ((unsigned char)a.data[i] + (unsigned char)b.data[i]);
if (sum > 255)
sum = 255;
res.data[i] = (uint8_t)sum;
}
return res;
}
// sub unsigned saturate
goofy_inline uint8x16_t subsatu(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16_t res;
for (uint32_t i = 0; i < 16; i++)
{
int32_t diff = ((unsigned char)a.data[i] - (unsigned char)b.data[i]);
if (diff < 0)
diff = 0;
res.data[i] = (uint8_t)diff;
}
return res;
}
// transpose as one 4x4 RGBA block
//
// in:
//
// R0 = | a0.rgba | a1.rgba | a2.rgba | a3.rgba |
// R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba |
// R2 = | c0.rgba | c1.rgba | c2.rgba | c3.rgba |
// R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba |
//
// out:
//
// R0 = | a0.rgba | b0.rgba | c0.rgba | d0.rgba |
// R1 = | a1.rgba | b1.rgba | c1.rgba | d1.rgba |
// R2 = | a2.rgba | b2.rgba | c2.rgba | d2.rgba |
// R3 = | a3.rgba | b3.rgba | c3.rgba | d3.rgba |
//
goofy_inline uint8x16x4_t transposeAs4x4(const uint8x16x4_t& v)
{
// a0, b0, a1, b1
const uint8x16_t tr0 = detail::unpacklo32(v.r0, v.r1);
// c0, d0, c1, d1
const uint8x16_t tr1 = detail::unpacklo32(v.r2, v.r3);
// a2, b2, a3, b3
const uint8x16_t tr2 = detail::unpackhi32(v.r0, v.r1);
// c2, d2, c3, d3
const uint8x16_t tr3 = detail::unpackhi32(v.r2, v.r3);
uint8x16x4_t res;
// a0, b0, c0, d0
res.r0 = detail::unpacklo64(tr0, tr1);
// a1, b1, c1, d1
res.r1 = detail::unpackhi64(tr0, tr1);
// a2, b2, c2, d2
res.r2 = detail::unpacklo64(tr2, tr3);
// a3, b3, c3, d3
res.r3 = detail::unpackhi64(tr2, tr3);
return res;
}
// deinterleave as 4x16
//
// in:
//
// R0 = | a0.rgba | a1.rgba | a2.rgba | a3.rgba |
// R1 = | b0.rgba | b1.rgba | b2.rgba | b3.rgba |
// R2 = | c0.rgba | c1.rgba | c2.rgba | c3.rgba |
// R3 = | d0.rgba | d1.rgba | d2.rgba | d3.rgba |
//
// out:
//
// R0 = | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r |
// R1 = | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g |
// R2 = | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b |
// R3 = | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a |
//
goofy_inline uint8x16x3_t deinterleaveRGB(const uint8x16x4_t& v)
{
// step 1
// | a0.r | b0.r | a0.g | b0.g | a0.b | b0.b | a0.a | b0.a | a1.r | b1.r | a1.g | b1.g | a1.b | b1.b | a1.a | b1.a |
// | a2.r | b2.r | a2.g | b2.g | a2.b | b2.b | a2.a | b2.a | a3.r | b3.r | a3.g | b3.g | a3.b | b3.b | a3.a | b3.a |
const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1);
const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1);
// | c0.r | d0.r | c0.g | d0.g | c0.b | d0.b | c0.a | d0.a | c1.r | d1.r | c1.g | d1.g | c1.b | d1.b | c1.a | d1.a |
// | c2.r | d2.r | c2.g | d2.g | c2.b | d2.b | c2.a | d2.a | c3.r | d3.r | c3.g | d3.g | c3.b | d3.b | c3.a | d3.a |
const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3);
const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3);
// step 2
// | a0.r | a2.r | b0.r | b2.r | a0.g | a2.g | b0.g | b2.g | a0.b | a2.b | b0.b | b2.b | a0.a | a2.a | b0.a | b2.a |
// | a1.r | a3.r | b1.r | b3.r | a1.g | a3.g | b1.g | b3.g | a1.b | a3.b | b1.b | b3.b | a1.a | a3.a | b1.a | b3.a |
const uint8x16_t s1a = detail::unpacklo8(s0a, s0b);
const uint8x16_t s1b = detail::unpackhi8(s0a, s0b);
// | c0.r | c2.r | d0.r | d2.r | c0.g | c2.g | d0.g | d2.g | c0.b | c2.b | d0.b | d2.b | c0.a | c2.a | d0.a | d2.a |
// | c1.r | c3.r | d1.r | d3.r | c1.g | c3.g | d1.g | d3.g | c1.b | c3.b | d1.b | d3.b | c1.a | c3.a | d1.a | d3.a |
const uint8x16_t s1c = detail::unpacklo8(s0c, s0d);
const uint8x16_t s1d = detail::unpackhi8(s0c, s0d);
// step 3
// | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g |
// | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a |
const uint8x16_t s2a = detail::unpacklo8(s1a, s1b);
const uint8x16_t s2b = detail::unpackhi8(s1a, s1b);
// | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g |
// | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a |
const uint8x16_t s2c = detail::unpacklo8(s1c, s1d);
const uint8x16_t s2d = detail::unpackhi8(s1c, s1d);
// step 4 (final)
uint8x16x3_t res;
// | a0.r | a1.r | a2.r | a3.r | b0.r | b1.r | b2.r | b3.r | c0.r | c1.r | c2.r | c3.r | d0.r | d1.r | d2.r | d3.r |
// | a0.g | a1.g | a2.g | a3.g | b0.g | b1.g | b2.g | b3.g | c0.g | c1.g | c2.g | c3.g | d0.g | d1.g | d2.g | d3.g |
res.r0 = detail::unpacklo64(s2a, s2c);
res.r1 = detail::unpackhi64(s2a, s2c);
// | a0.b | a1.b | a2.b | a3.b | b0.b | b1.b | b2.b | b3.b | c0.b | c1.b | c2.b | c3.b | d0.b | d1.b | d2.b | d3.b |
// | a0.a | a1.a | a2.a | a3.a | b0.a | b1.a | b2.a | b3.a | c0.a | c1.a | c2.a | c3.a | d0.a | d1.a | d2.a | d3.a |
res.r2 = detail::unpacklo64(s2b, s2d);
//res.r3 = detail::unpackhi64(s2b, s2d);
return res;
}
// transpose as four single channel 4x4 blocks at once
//
// in:
//
// R0 = | bl0.abcd | bl0.efgh | bl0.ijkl | bl0.mnop |
// R1 = | bl1.abcd | bl1.efgh | bl1.ijkl | bl1.mnop |
// R2 = | bl2.abcd | bl2.efgh | bl2.ijkl | bl2.mnop |
// R3 = | bl3.abcd | bl3.efgh | bl3.ijkl | bl3.mnop |
//
// out:
// NOTE: columns are swapped!
//
// (original column order: 2 3 0 1)
// R0 = | bl0.cgko | bl0.dhlp | bl0.aeim | bl0.bfjn |
// R1 = | bl1.cgko | bl1.dhlp | bl1.aeim | bl1.bfjn |
// R2 = | bl2.cgko | bl2.dhlp | bl2.aeim | bl2.bfjn |
// R3 = | bl3.cgko | bl3.dhlp | bl3.aeim | bl3.bfjn |
//
// +---+---+---+---+ +---+---+---+---+
// | A | B | C | D | | C | G | K | O |
// +---+---+---+---+ +---+---+---+---+
// | E | F | G | H | | D | H | L | P |
// +---+---+---+---+ --> +---+---+---+---+
// | I | J | K | L | | A | E | I | M |
// +---+---+---+---+ +---+---+---+---+
// | M | N | O | P | | B | F | J | N |
// +---+---+---+---+ +---+---+---+---+
//
goofy_inline uint8x16x4_t transposeAs4x4x4(const uint8x16x4_t& v)
{
// step 1
// | 0.a | 1.a | 0.b | 1.b | 0.c | 1.c | 0.d | 1.d | 0.e | 1.e | 0.f | 1.f | 0.g | 1.g | 0.h | 1.h |
// | 0.i | 1.i | 0.j | 1.j | 0.k | 1.k | 0.l | 1.l | 0.m | 1.m | 0.n | 1.n | 0.o | 1.o | 0.p | 1.p |
const uint8x16_t s0a = detail::unpacklo8(v.r0, v.r1);
const uint8x16_t s0b = detail::unpackhi8(v.r0, v.r1);
// | 2.a | 3.a | 2.b | 3.b | 2.c | 3.c | 2.d | 3.d | 2.e | 3.e | 2.f | 3.f | 2.g | 3.g | 2.h | 3.h |
// | 2.i | 3.i | 2.j | 3.j | 2.k | 3.k | 2.l | 3.l | 2.m | 3.m | 2.n | 3.n | 2.o | 3.o | 2.p | 3.p |
const uint8x16_t s0c = detail::unpacklo8(v.r2, v.r3);
const uint8x16_t s0d = detail::unpackhi8(v.r2, v.r3);
// step 2
// | 0.a | 0.i | 1.a | 1.i | 0.b | 0.j | 1.b | 1.j | 0.c | 0.k | 1.c | 1.k | 0.d | 0.l | 1.d | 1.l |
// | 0.e | 0.m | 1.e | 1.m | 0.f | 0.n | 1.f | 1.n | 0.g | 0.o | 1.g | 1.o | 0.h | 0.p | 1.h | 1.p |
const uint8x16_t s1a = detail::unpacklo8(s0a, s0b);
const uint8x16_t s1b = detail::unpackhi8(s0a, s0b);
// | 2.a | 2.i | 3.a | 3.i | 2.b | 2.j | 3.b | 3.j | 2.c | 2.k | 3.c | 3.k | 2.d | 2.l | 3.d | 3.l |
// | 2.e | 2.m | 3.e | 3.m | 2.f | 2.n | 3.f | 3.n | 2.g | 2.o | 3.g | 3.o | 2.h | 2.p | 3.h | 3.p |
const uint8x16_t s1c = detail::unpacklo8(s0c, s0d);
const uint8x16_t s1d = detail::unpackhi8(s0c, s0d);
// step 3
// | 0.a | 0.e | 0.i | 0.m | 1.a | 1.e | 1.i | 1.m | 0.b | 0.f | 0.j | 0.n | 1.b | 1.f | 1.j | 1.n |
// | 0.c | 0.g | 0.k | 0.o | 1.c | 1.g | 1.k | 1.o | 0.d | 0.h | 0.l | 0.p | 1.d | 1.h | 1.l | 1.p |
const uint8x16_t s2a = detail::unpacklo8(s1a, s1b);
const uint8x16_t s2b = detail::unpackhi8(s1a, s1b);
// | 2.a | 2.e | 2.i | 2.m | 3.a | 3.e | 3.i | 3.m | 2.b | 2.f | 2.j | 2.n | 3.b | 3.f | 3.j | 3.n |
// | 2.c | 2.g | 2.k | 2.o | 3.c | 3.g | 3.k | 3.o | 2.d | 2.h | 2.l | 2.p | 3.d | 3.h | 3.l | 3.p |
const uint8x16_t s2c = detail::unpacklo8(s1c, s1d);
const uint8x16_t s2d = detail::unpackhi8(s1c, s1d);
// step 4
// | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o |
// | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p |
const uint8x16_t s3a = detail::unpacklo32(s2a, s2b);
const uint8x16_t s3b = detail::unpackhi32(s2a, s2b);
// | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o |
// | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p |
const uint8x16_t s3c = detail::unpacklo32(s2c, s2d);
const uint8x16_t s3d = detail::unpackhi32(s2c, s2d);
// step 5
// | 0.a | 0.e | 0.i | 0.m | 0.c | 0.g | 0.k | 0.o | 0.b | 0.f | 0.j | 0.n | 0.d | 0.h | 0.l | 0.p |
// | 1.a | 1.e | 1.i | 1.m | 1.c | 1.g | 1.k | 1.o | 1.b | 1.f | 1.j | 1.n | 1.d | 1.h | 1.l | 1.p |
const uint8x16_t s4a = detail::unpacklo64(s3a, s3b);
const uint8x16_t s4b = detail::unpackhi64(s3a, s3b);
// | 2.a | 2.e | 2.i | 2.m | 2.c | 2.g | 2.k | 2.o | 2.b | 2.f | 2.j | 2.n | 2.d | 2.h | 2.l | 2.p |
// | 3.a | 3.e | 3.i | 3.m | 3.c | 3.g | 3.k | 3.o | 3.b | 3.f | 3.j | 3.n | 3.d | 3.h | 3.l | 3.p |
const uint8x16_t s4c = detail::unpacklo64(s3c, s3d);
const uint8x16_t s4d = detail::unpackhi64(s3c, s3d);
// step 5 (final)
uint8x16x4_t res;
// | 0.c | 0.g | 0.k | 0.o | 0.d | 0.h | 0.l | 0.p | 0.a | 0.e | 0.i | 0.m | 0.b | 0.f | 0.j | 0.n |
res.r0 = detail::swizzleU1302(s4a);
// | 1.c | 1.g | 1.k | 1.o | 1.d | 1.h | 1.l | 1.p | 1.a | 1.e | 1.i | 1.m | 1.b | 1.f | 1.j | 1.n |
res.r1 = detail::swizzleU1302(s4b);
// | 2.c | 2.g | 2.k | 2.o | 2.d | 2.h | 2.l | 2.p | 2.a | 2.e | 2.i | 2.m | 2.b | 2.f | 2.j | 2.n |
res.r2 = detail::swizzleU1302(s4c);
// | 3.c | 3.g | 3.k | 3.o | 3.d | 3.h | 3.l | 3.p | 3.a | 3.e | 3.i | 3.m | 3.b | 3.f | 3.j | 3.n |
res.r3 = detail::swizzleU1302(s4d);
return res;
}
// like ZipU4 but for two parallel zips
//
// in:
//
// A = | a0.rgba | a1.rgba | a2.rgba | a3.rgba |
// B = | b0.rgba | b1.rgba | b2.rgba | b3.rgba |
// C = | c0.rgba | c1.rgba | c2.rgba | c3.rgba |
// D = | d0.rgba | d1.rgba | d2.rgba | d3.rgba |
//
// out:
//
// R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba |
// R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba |
// R2 = | c0.rgba | d0.rgba | c1.rgba | d1.rgba |
// R3 = | c2.rgba | d2.rgba | c3.rgba | d3.rgba |
//
goofy_inline uint8x16x4_t zipU4x2(const uint8x16_t& a, const uint8x16_t& b, const uint8x16_t& c, const uint8x16_t& d)
{
const uint8x16x4_t res = {
detail::unpacklo32(a, b),
detail::unpackhi32(a, b),
detail::unpacklo32(c, d),
detail::unpackhi32(c, d),
};
return res;
}
//
// in:
//
// a = | a0.rgba | a1.rgba | a2.rgba| a3.rgba
// b = | b0.rgba | b1.rgba | b2.rgba| b3.rgba
//
// out:
//
// R0 = | a0.rgba | b0.rgba | a1.rgba | b1.rgba |
// R1 = | a2.rgba | b2.rgba | a3.rgba | b3.rgba |
//
goofy_inline uint8x16x2_t zipU4(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16x2_t res;
res.r0 = detail::unpacklo32(a, b);
res.r1 = detail::unpackhi32(a, b);
return res;
}
//
// in:
//
// a = | a0 | a1 | a2 | a3 | a4 | a5 | a6 | a7 | a8 | a9 | aA | aB | aC | aD | aE | aF |
// b = | b0 | b1 | b2 | b3 | b4 | b5 | b6 | b7 | b8 | b9 | bA | bB | bC | bD | bE | bF |
//
// out:
//
// R0 = | a0 | b0 | a1 | b1 | a2 | b2 | a3 | b3 | a4 | b4 | a5 | b5 | a6 | b6 | a7 | b7 |
// R1 = | a8 | b8 | a9 | b9 | aA | bA | aB | bB | aC | bC | aD | bD | aE | bE | aF | bF |
//
goofy_inline uint8x16x2_t zipB16(const uint8x16_t& a, const uint8x16_t& b)
{
uint8x16x2_t res;
res.r0 = detail::unpacklo8(a, b);
res.r1 = detail::unpackhi8(a, b);
return res;
}
goofy_inline uint8x16_t not(const uint8x16_t& v)
{
uint8x16_t res;
for (int i = 0; i < 16; i++)
{
res.data[i] = v.data[i] ^ 0xFF;
}
return res;
}
#endif
}
static_assert(sizeof(uint8x16_t) == 16, "Incorrect uint8x16_t sizeof");
static_assert(sizeof(uint8x16x2_t) == 32, "Incorrect uint8x16x2_t sizeof");
static_assert(sizeof(uint8x16x3_t) == 48, "Incorrect uint8x16x3_t sizeof");
static_assert(sizeof(uint8x16x4_t) == 64, "Incorrect uint8x16x4_t sizeof");
static_assert(sizeof(uint64x2_t) == 16, "Incorrect uint64x2_t sizeof");
// Block brightness variance to ETC control byte
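// Each entry is the high byte of the first ETC1 word: the same 3-bit intensity-modifier table
// codeword for both sub-blocks plus the differential and flip bits set (entries step by 0x24,
// i.e. +1 for each codeword). Larger brightness ranges select tables with larger modifiers
// (ETC1 codeword 0 has the smallest intensity modifiers, codeword 7 the largest).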
static const uint32_t etc1BrighnessRangeTocontrolByte[256] = {
0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000,
0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x03000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000,
0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x27000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000,
0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000,
0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x4B000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000,
0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000,
0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x6F000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000,
0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0x93000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000,
0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000,
0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xB7000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000,
0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xDB000000, 0xFF000000, 0xFF000000
};
enum GoofyCodecType
{
GOOFY_DXT1,
GOOFY_ETC1,
};
//
// Encode 4 DXT1/ETC1 blocks at once
//
template<GoofyCodecType CODEC_TYPE>
goofy_inline void goofySimdEncode(const unsigned char* goofy_restrict inputRGBA, size_t inputStride, unsigned char* goofy_restrict pResult)
{
// Fetch a 16x4 pixel strip from the buffer (four 4x4 blocks)
// A 16-pixel-wide window is better for CPU cache utilization (64 bytes per line) and for SIMD lane utilization
// -----------------------------------------------------------
uint8x16x4_t bl0;
uint8x16x4_t bl1;
uint8x16x4_t bl2;
uint8x16x4_t bl3;
bl0.r0 = simd::fetch(inputRGBA);
bl1.r0 = simd::fetch(inputRGBA + 16);
bl2.r0 = simd::fetch(inputRGBA + 32);
bl3.r0 = simd::fetch(inputRGBA + 48);
inputRGBA += inputStride;
bl0.r1 = simd::fetch(inputRGBA);
bl1.r1 = simd::fetch(inputRGBA + 16);
bl2.r1 = simd::fetch(inputRGBA + 32);
bl3.r1 = simd::fetch(inputRGBA + 48);
inputRGBA += inputStride;
bl0.r2 = simd::fetch(inputRGBA);
bl1.r2 = simd::fetch(inputRGBA + 16);
bl2.r2 = simd::fetch(inputRGBA + 32);
bl3.r2 = simd::fetch(inputRGBA + 48);
inputRGBA += inputStride;
bl0.r3 = simd::fetch(inputRGBA);
bl1.r3 = simd::fetch(inputRGBA + 16);
bl2.r3 = simd::fetch(inputRGBA + 32);
bl3.r3 = simd::fetch(inputRGBA + 48);
// Find min block colors
// -----------------------------------------------------------
const uint8x16x4_t blMin = {
simd::minu(simd::minu(bl0.r0, bl0.r1), simd::minu(bl0.r2, bl0.r3)), // min0_clmn0.rgba | min0_clmn1.rgba | min0_clmn2.rgba | min0_clmn3.rgba
simd::minu(simd::minu(bl1.r0, bl1.r1), simd::minu(bl1.r2, bl1.r3)), // min1_clmn0.rgba | min1_clmn1.rgba | min1_clmn2.rgba | min1_clmn3.rgba
simd::minu(simd::minu(bl2.r0, bl2.r1), simd::minu(bl2.r2, bl2.r3)), // min2_clmn0.rgba | min2_clmn1.rgba | min2_clmn2.rgba | min2_clmn3.rgba
simd::minu(simd::minu(bl3.r0, bl3.r1), simd::minu(bl3.r2, bl3.r3)) // min3_clmn0.rgba | min3_clmn1.rgba | min3_clmn2.rgba | min3_clmn3.rgba
};
// blMinTr (transposed blMin)
// min0_clmn0.rgba | min1_clmn0.rgba | min2_clmn0.rgba | min3_clmn0.rgba
// min0_clmn1.rgba | min1_clmn1.rgba | min2_clmn1.rgba | min3_clmn1.rgba
// min0_clmn2.rgba | min1_clmn2.rgba | min2_clmn2.rgba | min3_clmn2.rgba
// min0_clmn3.rgba | min1_clmn3.rgba | min2_clmn3.rgba | min3_clmn3.rgba
const uint8x16x4_t blMinTr = simd::transposeAs4x4(blMin);
// Per-block min colors
// min0.rgba | min1.rgba | min2.rgba | min3.rgba
const uint8x16_t minColors = simd::minu(
simd::minu(blMinTr.r0, blMinTr.r1),
simd::minu(blMinTr.r2, blMinTr.r3)
);
// Do the same to find the max block colors
// -----------------------------------------------------------
const uint8x16x4_t blMax = {
simd::maxu(simd::maxu(bl0.r0, bl0.r1), simd::maxu(bl0.r2, bl0.r3)),
simd::maxu(simd::maxu(bl1.r0, bl1.r1), simd::maxu(bl1.r2, bl1.r3)),
simd::maxu(simd::maxu(bl2.r0, bl2.r1), simd::maxu(bl2.r2, bl2.r3)),
simd::maxu(simd::maxu(bl3.r0, bl3.r1), simd::maxu(bl3.r2, bl3.r3))
};
const uint8x16x4_t blMaxTr = simd::transposeAs4x4(blMax);
// Per-block max colors
// max0.rgba | max1.rgba | max2.rgba | max3.rgba
const uint8x16_t maxColors = simd::maxu(
simd::maxu(blMaxTr.r0, blMaxTr.r1),
simd::maxu(blMaxTr.r2, blMaxTr.r3)
);
// Find min/max brightness
// -----------------------------------------------------------
// Note: some SIMD lanes are wasted here - not ideal, but acceptable
//
// min0.rgba | min0.rgba | min1.rgba | min1.rgba
// min2.rgba | min2.rgba | min3.rgba | min3.rgba
// max0.rgba | max0.rgba | max1.rgba | max1.rgba
// max2.rgba | max2.rgba | max3.rgba | max3.rgba
const uint8x16x4_t blMinMax = simd::zipU4x2(minColors, minColors, maxColors, maxColors);
// Deinterleave
// min0.rr | min1.rr | min2.rr | min3.rr | max0.rr | max1.rr | max2.rr | max3.rr
// min0.gg | min1.gg | min2.gg | min3.gg | max0.gg | max1.gg | max2.gg | max3.gg
// min0.bb | min1.bb | min2.bb | min3.bb | max0.bb | max1.bb | max2.bb | max3.bb
const uint8x16x3_t blMinMaxDi = simd::deinterleaveRGB(blMinMax);
// Get Y component of YCoCg color-model (perceptual brightness)
// https://en.wikipedia.org/wiki/YCoCg
// Y = 0.25 * R + 0.25 * B + 0.5 * G
// We can rewrite equation above using the following form
// Y = (((R + B) / 2) + G) / 2
//
// Y = min0.yy | min1.yy | min2.yy | min3.yy | max0.yy | max1.yy | max2.yy | max3.yy
const uint8x16_t Y = simd::avg(simd::avg(blMinMaxDi.r0, blMinMaxDi.r2), blMinMaxDi.r1);
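// Note: avg() rounds up ((a + b + 1) >> 1), so the computed Y stays within 1 of the exact
// 0.25*R + 0.5*G + 0.25*B value - close enough for thresholding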
// Min/max brightness per block
// R0 = min0.yyyy | min1.yyyy | min2.yyyy | min3.yyyy
// R1 = max0.yyyy | max1.yyyy | max2.yyyy | max3.yyyy
const uint8x16x2_t blMinMaxY = simd::zipB16(Y, Y);
// Clamp the brightness range to a minimum of 8
const uint8x16_t constEight = simd::fetch(&gConstEight);
// range0.yyyy | range1.yyyy | range2.yyyy | range3.yyyy
const uint8x16_t blRangeY = simd::maxu(simd::subsatu(blMinMaxY.r1, blMinMaxY.r0), constEight);
// mid0.yyyy | mid1.yyyy | mid2.yyyy | mid3.yyyy
const uint8x16_t blMidY = simd::avg(blMinMaxY.r0, blMinMaxY.r1);
// Approximate multiplication by 0.375 to get quantization thresholds
const uint8x16_t constZero = simd::zero();
const uint8x16_t blHalfRangeY = simd::avg(blRangeY, constZero);
const uint8x16_t blQuarterRangeY = simd::avg(blHalfRangeY, constZero);
const uint8x16_t blEighthsRangeY = simd::avg(blQuarterRangeY, constZero);
// Threshold = (quarter + eighth) = (0.25 + 0.125) * range ~= range * 0.375
// qt0.yyyy | qt1.yyyy | qt2.yyyy | qt3.yyyy
const uint8x16_t blQThreshold = simd::addsatu(blQuarterRangeY, blEighthsRangeY);
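// Worked example: range = 64 -> half = 32, quarter = 16, eighth = 8, threshold = 16 + 8 = 24 = 0.375 * 64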
// Quantization (generate indices)
// -----------------------------------------------------------
const uint8x16_t constMaxInt = simd::fetch(&gConstMaxInt);
// block 0
//
// p0.r p1.r p2.r p3.r p4.r p5.r p6.r p7.r p8.r p9.r p10.r p11.r p12.r p13.r p14.r p15.r
// p0.g p1.g p2.g p3.g p4.g p5.g p6.g p7.g p8.g p9.g p10.g p11.g p12.g p13.g p14.g p15.g
// p0.b p1.b p2.b p3.b p4.b p5.b p6.b p7.b p8.b p9.b p10.b p11.b p12.b p13.b p14.b p15.b
const uint8x16x3_t bl0Di = simd::deinterleaveRGB(bl0);
// Convert RGB to brightness
// per-pixel block brightness
const uint8x16_t bl0Y = simd::avg(simd::avg(bl0Di.r0, bl0Di.r2), bl0Di.r1);
// Block brightness to compare with
const uint8x16_t bl0MidY = simd::replicateU0000(blMidY);
// Brightness difference (per-pixel in block)
// NOTE: we need to clamp difference to max signed int8, because of the signed comparison later
const uint8x16_t bl0PosDiffY = simd::minu(simd::subsatu(bl0Y, bl0MidY), constMaxInt);
const uint8x16_t bl0NegDiffY = simd::minu(simd::subsatu(bl0MidY, bl0Y), constMaxInt);
// Greater or Equal to zero mask
const uint8x16_t bl0GezMask = simd::cmpeqi(bl0NegDiffY, constZero);
// Absolute difference of brightness (per-pixel in block)
const uint8x16_t bl0AbsDiffY = simd::or(bl0PosDiffY, bl0NegDiffY);
// get quantization threshold for current block
const uint8x16_t bl0QThreshold = simd::replicateU0000(blQThreshold);
// Less than Quantization Threshold mask
const uint8x16_t bl0LqtMask = simd::cmplti(bl0AbsDiffY, bl0QThreshold);
// Here we've got two bitmasks
//
// GezMask = greater than or equal to zero (per pixel)
// LqtMask = less than quantization threshold (per pixel)
//
//
// min qt qt max
// x---------x-----+-----x---------x
// 0
//
// |---------------| greater than or equal to zero (GezMask)
//
// |-----------| less than quantization threshold (LqtMask)
//
// block 1
const uint8x16x3_t bl1Di = simd::deinterleaveRGB(bl1);
const uint8x16_t bl1Y = simd::avg(simd::avg(bl1Di.r0, bl1Di.r2), bl1Di.r1);
const uint8x16_t bl1MidY = simd::replicateU1111(blMidY);
const uint8x16_t bl1PosDiffY = simd::minu(simd::subsatu(bl1Y, bl1MidY), constMaxInt);
const uint8x16_t bl1NegDiffY = simd::minu(simd::subsatu(bl1MidY, bl1Y), constMaxInt);
const uint8x16_t bl1GezMask = simd::cmpeqi(bl1NegDiffY, constZero);
const uint8x16_t bl1AbsDiffY = simd::or(bl1PosDiffY, bl1NegDiffY);
const uint8x16_t bl1QThreshold = simd::replicateU1111(blQThreshold);
const uint8x16_t bl1LqtMask = simd::cmplti(bl1AbsDiffY, bl1QThreshold);
// block 2
const uint8x16x3_t bl2Di = simd::deinterleaveRGB(bl2);
const uint8x16_t bl2Y = simd::avg(simd::avg(bl2Di.r0, bl2Di.r2), bl2Di.r1);
const uint8x16_t bl2MidY = simd::replicateU2222(blMidY);
const uint8x16_t bl2PosDiffY = simd::minu(simd::subsatu(bl2Y, bl2MidY), constMaxInt);
const uint8x16_t bl2NegDiffY = simd::minu(simd::subsatu(bl2MidY, bl2Y), constMaxInt);
const uint8x16_t bl2GezMask = simd::cmpeqi(bl2NegDiffY, constZero);
const uint8x16_t bl2AbsDiffY = simd::or(bl2PosDiffY, bl2NegDiffY);
const uint8x16_t bl2QThreshold = simd::replicateU2222(blQThreshold);
const uint8x16_t bl2LqtMask = simd::cmplti(bl2AbsDiffY, bl2QThreshold);
// block 3
const uint8x16x3_t bl3Di = simd::deinterleaveRGB(bl3);
const uint8x16_t bl3Y = simd::avg(simd::avg(bl3Di.r0, bl3Di.r2), bl3Di.r1);
const uint8x16_t bl3MidY = simd::replicateU3333(blMidY);
const uint8x16_t bl3PosDiffY = simd::minu(simd::subsatu(bl3Y, bl3MidY), constMaxInt);
const uint8x16_t bl3NegDiffY = simd::minu(simd::subsatu(bl3MidY, bl3Y), constMaxInt);
const uint8x16_t bl3GezMask = simd::cmpeqi(bl3NegDiffY, constZero);
const uint8x16_t bl3AbsDiffY = simd::or(bl3PosDiffY, bl3NegDiffY);
const uint8x16_t bl3QThreshold = simd::replicateU3333(blQThreshold);
const uint8x16_t bl3LqtMask = simd::cmplti(bl3AbsDiffY, bl3QThreshold);
// Finalize blocks
// -----------------------------------------------------------
if (CODEC_TYPE == GOOFY_DXT1)
{
// Generate DXT indices using given masks
// DXT indices order
// -------------------------
// C0(max) C2 C3 C1(min)
// DEC: | 0 | 2 | 3 | 1 |
// BIN: | 00b | 10b | 11b | 01b |
//
// | GezMask |
// | LqtMask |
// Zip the two masks to match the DXT index bit order
// Gez0 | Lqt0 | Gez1 | Lqt1 | Gez2 | Lqt2 | Gez3 | Lqt3 | Gez4 | Lqt4 | Gez5 | Lqt5 | Gez6 | Lqt6 | Gez7 | Lqt7
// Gez8 | Lqt8 | Gez9 | Lqt9 | GezA | LqtA | GezB | LqtB | GezC | LqtC | GezD | LqtD | GezE | LqtE | GezF | LqtF
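// After the MSB gather below, each pixel's 2-bit index is (NOT Gez) | (Lqt << 1):
// far above mid -> 0 (max color), near above -> 2, near below -> 3, far below -> 1 (min color)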
const uint8x16x2_t bl0RawIndices = simd::zipB16(simd::not(bl0GezMask), bl0LqtMask);
const uint8x16x2_t bl3RawIndices = simd::zipB16(simd::not(bl3GezMask), bl3LqtMask);
const uint8x16x2_t bl2RawIndices = simd::zipB16(simd::not(bl2GezMask), bl2LqtMask);
const uint8x16x2_t bl1RawIndices = simd::zipB16(simd::not(bl1GezMask), bl1LqtMask);
// Bytes to bits
uint32_t bl0Indices = simd::moveMaskMSB(bl0RawIndices.r0) | (simd::moveMaskMSB(bl0RawIndices.r1) << 16);
uint32_t bl1Indices = simd::moveMaskMSB(bl1RawIndices.r0) | (simd::moveMaskMSB(bl1RawIndices.r1) << 16);
uint32_t bl2Indices = simd::moveMaskMSB(bl2RawIndices.r0) | (simd::moveMaskMSB(bl2RawIndices.r1) << 16);
uint32_t bl3Indices = simd::moveMaskMSB(bl3RawIndices.r0) | (simd::moveMaskMSB(bl3RawIndices.r1) << 16);
// Convert rgb888 to rgb555
// There is no per-byte right shift in SSE2, so approximate >> 3 with three rounds of avg-with-zero
// Subtract eight first to compensate for the +1 rounding in each avg ((a + b + 1) >> 1)
// max555_0.rgba | max555_1.rgba | max555_2.rgba | max555_3.rgba
const uint8x16_t maxColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(maxColors, constEight), constZero), constZero), constZero);
// min555_0.rgba | min555_1.rgba | min555_2.rgba | min555_3.rgba
const uint8x16_t minColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(minColors, constEight), constZero), constZero), constZero);
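// For x >= 8 this computes ((x - 8) + 7) >> 3 == (x - 1) >> 3, which equals x >> 3 except when x is
// a multiple of 8 (one less) - a negligible error for 5-bit endpoints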
// max555_0.rgba | min555_0.rgba | max555_1.rgba | min555_1.rgba
// max555_2.rgba | min555_2.rgba | max555_3.rgba | min555_3.rgba
const uint8x16x2_t maxMinColors555 = simd::zipU4(maxColors555, minColors555);
const uint64x2_t maxMin01 = simd::getAsUInt64x2(maxMinColors555.r0);
const uint64x2_t maxMin23 = simd::getAsUInt64x2(maxMinColors555.r1);
// R0
// AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 11 = 0000000000000000 1111100000000000b
// AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b >> 2 = 0000000000000000 0000011111000000b
// AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b >> 16 = 0000000000000000 0000000000011111b
// R1
// AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 5 = 1111100000000000 0000000000000000b
// AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 18 = 0000011111000000 0000000000000000b
// AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 32 = 0000000000011111 0000000000000000b
// 0x20 = 0000000000000000 0000000000100000b
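// In other words: pack each 5-bit channel into RGB565 fields (R -> bits 15..11, G -> bits 10..6,
// B -> bits 4..0); color0 (the max color) lands in the low 16 bits, color1 (the min color) in the
// high 16 bits of the block's first dword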
uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult;
uint32_t block0a = (uint32_t)(0x20 | // max color green channel LSB (to avoid switching to DXT1 3-color mode)
(maxMin01.r0 & 0x1Full) << 11ull | (maxMin01.r0 & 0x1F00ull) >> 2ull | (maxMin01.r0 & 0x1F0000ull) >> 16ull | // max color
(maxMin01.r0 & 0x1F00000000ull) >> 5ull | (maxMin01.r0 & 0x1F0000000000ull) >> 18ull | (maxMin01.r0 & 0x1F000000000000ull) >> 32ull); // min color
//uint32_t block0b = bl0Indices << 32ull; // indices
*pDest = block0a; pDest++; *pDest = bl0Indices;
uint32_t block1a = (uint32_t)(0x20 |
(maxMin01.r1 & 0x1Full) << 11ull | (maxMin01.r1 & 0x1F00ull) >> 2ull | (maxMin01.r1 & 0x1F0000ull) >> 16ull |
(maxMin01.r1 & 0x1F00000000ull) >> 5ull | (maxMin01.r1 & 0x1F0000000000ull) >> 18ull | (maxMin01.r1 & 0x1F000000000000ull) >> 32ull);
//uint32_t block1b = bl1Indices << 32ull;
pDest++; *pDest = block1a; pDest++; *pDest = bl1Indices;
uint32_t block2a = (uint32_t)(0x20 |
(maxMin23.r0 & 0x1Full) << 11ull | (maxMin23.r0 & 0x1F00ull) >> 2ull | (maxMin23.r0 & 0x1F0000ull) >> 16ull |
(maxMin23.r0 & 0x1F00000000ull) >> 5ull | (maxMin23.r0 & 0x1F0000000000ull) >> 18ull | (maxMin23.r0 & 0x1F000000000000ull) >> 32ull);
//bl2Indices << 32ull;
pDest++; *pDest = block2a; pDest++; *pDest = bl2Indices;
uint32_t block3a = (uint32_t)(0x20 |
(maxMin23.r1 & 0x1Full) << 11ull | (maxMin23.r1 & 0x1F00ull) >> 2ull | (maxMin23.r1 & 0x1F0000ull) >> 16ull |
(maxMin23.r1 & 0x1F00000000ull) >> 5ull | (maxMin23.r1 & 0x1F0000000000ull) >> 18ull | (maxMin23.r1 & 0x1F000000000000ull) >> 32ull);
//bl3Indices << 32ull;
pDest++; *pDest = block3a; pDest++; *pDest = bl3Indices;
}
else if (CODEC_TYPE == GOOFY_ETC1)
{
// Combined masks (top bit = GreaterEqualZero, lower 7 bits = LessThanQuantizationThreshold)
const uint8x16x4_t blMasks = {
simd::or(simd::andnot(constMaxInt, bl0GezMask), simd::and(bl0LqtMask, constMaxInt)),
simd::or(simd::andnot(constMaxInt, bl1GezMask), simd::and(bl1LqtMask, constMaxInt)),
simd::or(simd::andnot(constMaxInt, bl2GezMask), simd::and(bl2LqtMask, constMaxInt)),
simd::or(simd::andnot(constMaxInt, bl3GezMask), simd::and(bl3LqtMask, constMaxInt))
};
// +---+---+---+---+ +---+---+---+---+
// | A | B | C | D | | C | G | K | O |
// +---+---+---+---+ +---+---+---+---+
// | E | F | G | H | | D | H | L | P |
// +---+---+---+---+ --> +---+---+---+---+
// | I | J | K | L | | A | E | I | M |
// +---+---+---+---+ +---+---+---+---+
// | M | N | O | P | | B | F | J | N |
// +---+---+---+---+ +---+---+---+---+
const uint8x16x4_t blMasksTr = simd::transposeAs4x4x4(blMasks);
// Unpack masks and copy from bytes to bits
const uint32_t bl0PosOrZero = simd::moveMaskMSB(blMasksTr.r0);
const uint32_t bl1PosOrZero = simd::moveMaskMSB(blMasksTr.r1);
const uint32_t bl2PosOrZero = simd::moveMaskMSB(blMasksTr.r2);
const uint32_t bl3PosOrZero = simd::moveMaskMSB(blMasksTr.r3);
uint8x16_t bl0LessThanQtMask = simd::and(blMasksTr.r0, constMaxInt);
uint8x16_t bl1LessThanQtMask = simd::and(blMasksTr.r1, constMaxInt);
uint8x16_t bl2LessThanQtMask = simd::and(blMasksTr.r2, constMaxInt);
uint8x16_t bl3LessThanQtMask = simd::and(blMasksTr.r3, constMaxInt);
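// Saturating add with itself doubles the 0x7F payload (0x7F -> 0xFE), moving the Lqt bit into the
// MSB so a second moveMaskMSB can extract that bit-plane as well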
bl0LessThanQtMask = simd::addsatu(bl0LessThanQtMask, bl0LessThanQtMask);
bl1LessThanQtMask = simd::addsatu(bl1LessThanQtMask, bl1LessThanQtMask);
bl2LessThanQtMask = simd::addsatu(bl2LessThanQtMask, bl2LessThanQtMask);
bl3LessThanQtMask = simd::addsatu(bl3LessThanQtMask, bl3LessThanQtMask);
const uint32_t bl0LessThanQt = simd::moveMaskMSB(bl0LessThanQtMask);
const uint32_t bl1LessThanQt = simd::moveMaskMSB(bl1LessThanQtMask);
const uint32_t bl2LessThanQt = simd::moveMaskMSB(bl2LessThanQtMask);
const uint32_t bl3LessThanQt = simd::moveMaskMSB(bl3LessThanQtMask);
#if 1
// Keep the chromatic component of the average color, but override its brightness
// NOTE: slightly slower, but gives slightly better quality
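// (the average color is shifted along the brightness axis by |midY - avgY|, computed below, so its
// luma lands on the min/max midpoint while the per-channel chroma offsets are preserved)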
// Find average blocks color
const uint8x16x4_t blAvg = {
simd::avg(simd::avg(bl0.r0, bl0.r1), simd::avg(bl0.r2, bl0.r3)),
simd::avg(simd::avg(bl1.r0, bl1.r1), simd::avg(bl1.r2, bl1.r3)),
simd::avg(simd::avg(bl2.r0, bl2.r1), simd::avg(bl2.r2, bl2.r3)),
simd::avg(simd::avg(bl3.r0, bl3.r1), simd::avg(bl3.r2, bl3.r3))
};
const uint8x16x4_t blAvgTr = simd::transposeAs4x4(blAvg);
const uint8x16_t blAvgColors = simd::avg(
simd::avg(blAvgTr.r0, blAvgTr.r1),
simd::avg(blAvgTr.r2, blAvgTr.r3)
);
// Note: a lot of SIMD lanes are wasted here - not ideal (TODO)
//
// avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba
// avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba
// avg0.rgba | avg0.rgba | avg1.rgba | avg1.rgba
// avg2.rgba | avg2.rgba | avg3.rgba | avg3.rgba
const uint8x16x4_t blAvg4 = simd::zipU4x2(blAvgColors, blAvgColors, blAvgColors, blAvgColors);
// Deinterleave
// avg0.rr | avg1.rr | avg2.rr | avg3.rr | avg0.rr | avg1.rr | avg2.rr | avg3.rr
// avg0.gg | avg1.gg | avg2.gg | avg3.gg | avg0.gg | avg1.gg | avg2.gg | avg3.gg
// avg0.bb | avg1.bb | avg2.bb | avg3.bb | avg0.bb | avg1.bb | avg2.bb | avg3.bb
const uint8x16x3_t blAvg4Di = simd::deinterleaveRGB(blAvg4);
// Y = avg0.yy | avg1.yy | avg2.yy | avg3.yy | avg0.yy | avg1.yy | avg2.yy | avg3.yy
const uint8x16_t Y = simd::avg(simd::avg(blAvg4Di.r0, blAvg4Di.r2), blAvg4Di.r1);
// Min/max brightness per block
// R0 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy
// R1 = avg0.yyyy | avg1.yyyy | avg2.yyyy | avg3.yyyy // NOTE: not used!
const uint8x16x2_t blAvgY = simd::zipB16(Y, Y);
const uint8x16_t blPosCorrectionY = simd::minu(simd::subsatu(blMidY, blAvgY.r0), constMaxInt);
const uint8x16_t blNegCorrectionY = simd::minu(simd::subsatu(blAvgY.r0, blMidY), constMaxInt);
const uint8x16_t blCorrectionYGezMask = simd::cmpeqi(blNegCorrectionY, constZero);
const uint8x16_t blCorrectionYAbs = simd::or(blPosCorrectionY, blNegCorrectionY);
// Get the color in the middle between min/max colors of the block.
// NOTE: this is not the same as an average block color.
const uint8x16_t blBaseColorsPos = simd::addsatu(blAvgColors, blCorrectionYAbs);
const uint8x16_t blBaseColorsNeg = simd::subsatu(blAvgColors, blCorrectionYAbs);
const uint8x16_t blBaseColors = simd::select(blCorrectionYGezMask, blBaseColorsPos, blBaseColorsNeg);
#else
// Get the color in the middle between min/max colors of the block.
// NOTE: this is not the same as an average block color.
const uint8x16_t blBaseColors = simd::avg(minColors, maxColors);
#endif
// Convert rgb888 to rgb555
// There is no per-byte right shift in SSE2, so approximate >> 3 with three rounds of avg-with-zero
// (same (x - 1) >> 3 approximation as in the DXT1 path above; subtract eight to offset the rounding)
// mid555_0.rgba | mid555_1.rgba | mid555_2.rgba | mid555_3.rgba
const uint8x16_t baseColors555 = simd::avg(simd::avg(simd::avg(simd::subsatu(blBaseColors, constEight), constZero), constZero), constZero);
const uint64x2_t baseColors = simd::getAsUInt64x2(baseColors555);
// R0
// AAAAAAAA000000000000000000000000AAAAAAAA000000000000000000011111b << 3 = 00000000 00000000 11111000b
// AAAAAAAA000000000000000000000000AAAAAAAA000000000001111100000000b << 3 = 00000000 11111000 00000000b
// AAAAAAAA000000000000000000000000AAAAAAAA000111110000000000000000b << 3 = 11111000 00000000 00000000b
// R1
// AAAAAAAA000000000000000000011111AAAAAAAA000000000000000000000000b >> 29 = 00000000 00000000 11111000b
// AAAAAAAA000000000001111100000000AAAAAAAA000000000000000000000000b >> 29 = 00000000 11111000 00000000b
// AAAAAAAA000111110000000000000000AAAAAAAA000000000000000000000000b >> 29 = 11111000 00000000 00000000b
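// i.e. each 5-bit base color channel lands in the top 5 bits of its byte (R, G, B in bytes 0..2),
// the 3-bit differential deltas stay zero, and byte 3 receives the control byte looked up above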
uint32_t* goofy_restrict pDest = (uint32_t* goofy_restrict)pResult;
const uint32_t block0a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[0]] | ((baseColors.r0 << 3ull) & 0xFFFFFF);
const uint32_t block0b = ~(bl0PosOrZero | (bl0LessThanQt << 16));
*pDest = block0a; pDest++; *pDest = block0b;
const uint32_t block1a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[4]] | ((baseColors.r0 >> 29ull) & 0xFFFFFF);
const uint32_t block1b = ~(bl1PosOrZero | (bl1LessThanQt << 16));
pDest++; *pDest = block1a; pDest++; *pDest = block1b;
const uint32_t block2a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[8]] | ((baseColors.r1 << 3ull) & 0xFFFFFF);
const uint32_t block2b = ~(bl2PosOrZero | (bl2LessThanQt << 16));
pDest++; *pDest = block2a; pDest++; *pDest = block2b;
const uint32_t block3a = etc1BrighnessRangeTocontrolByte[blRangeY.m128i_u8[12]] | ((baseColors.r1 >> 29ull) & 0xFFFFFF);
const uint32_t block3b = ~(bl3PosOrZero | (bl3LessThanQt << 16));
pDest++; *pDest = block3a; pDest++; *pDest = block3b;
}
}
int compressDXT1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride)
{
// these checks are required because the compressor works on a 4x1 strip of blocks (16x4 pixels) at a time
if (width % 16 != 0)
{
return -1;
}
if (height % 4 != 0)
{
return -2;
}
unsigned int blockW = width >> 2;
unsigned int blockH = height >> 2;
size_t inputStride = stride;
for (uint32_t y = 0; y < blockH; y++)
{
const unsigned char* goofy_restrict encoderPos = input;
for (uint32_t x = 0; x < blockW; x += 4)
{
goofySimdEncode<GOOFY_DXT1>(encoderPos, inputStride, result);
encoderPos += 64; // 16 rgba pixels (4 DXT blocks) = 16 * 4 = 64
result += 32; // 4 DXT1 blocks = 8 * 4 = 32
}
input += inputStride * 4; // 4 lines
}
return 0;
}
int compressETC1(unsigned char* result, const unsigned char* input, unsigned int width, unsigned int height, unsigned int stride)
{
// these checks are required because the compressor works on a 4x1 strip of blocks (16x4 pixels) at a time
if (width % 16 != 0)
{
return -1;
}
if (height % 4 != 0)
{
return -2;
}
unsigned int blockW = width >> 2;
unsigned int blockH = height >> 2;
size_t inputStride = stride;
for (uint32_t y = 0; y < blockH; y++)
{
const unsigned char* goofy_restrict encoderPos = input;
for (uint32_t x = 0; x < blockW; x += 4)
{
goofySimdEncode<GOOFY_ETC1>(encoderPos, inputStride, result);
encoderPos += 64; // 16 rgba pixels (4 ETC1 blocks) = 16 * 4 = 64
result += 32; // 4 ETC1 blocks = 8 * 4 = 32
}
input += inputStride * 4; // 4 lines
}
return 0;
}
#undef goofy_restrict
#undef goofy_inline
#undef goofy_align16
}
#endif
// Copyright (c) 2020 Sergey Makeev <sergeymakeev@hotmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.